1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
"""The Extractor classes transform content into triples.
Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import abc
import typing
# bsie imports
from bsie.utils import node
from bsie.utils.bsfs import schema as _schema, typename
# exports
# Names that make up the public interface of this module, i.e., the names
# that a star-import of this module brings into scope.
__all__: typing.Sequence[str] = (
    'Extractor',
)
# constants
# Essential definitions that are typically used in extractor schemas:
# common external and bsfs prefixes, the bsfs:Entity node, and the
# xsd:string and xsd:integer literals.
# NOTE: This preamble is provided for convenience only. Nothing in this module
# applies it automatically; each Extractor has to make use of it itself,
# if so desired.
SCHEMA_PREAMBLE = '''
# common external prefixes
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix schema: <http://schema.org/>
# common bsfs prefixes
prefix bsfs: <http://bsfs.ai/schema/>
prefix bse: <http://bsfs.ai/schema/Entity#>
# essential nodes
bsfs:Entity rdfs:subClassOf bsfs:Node .
# common definitions
xsd:string rdfs:subClassOf bsfs:Literal .
xsd:integer rdfs:subClassOf bsfs:Literal .
'''
## code ##
class Extractor(abc.ABC):
    """Turn some content into (subject, predicate, value)-triples.

    An Extractor emits principal predicates, i.e., predicates of triples
    that include the subject and thus provide information about the
    content itself. If an extracted value is a node in its own right,
    the Extractor may also emit triples with auxiliary predicates.
    """

    # Type of content that is expected (i.e., the reader subclass).
    CONTENT_READER: typing.Optional[str] = None

    # Schema of the extractor.
    schema: _schema.Schema

    def __init__(self, schema: _schema.Schema):
        """Create an extractor and store its *schema*."""
        self.schema = schema

    def __str__(self) -> str:
        """Return the name that `typename` reports for this instance."""
        name = typename(self)
        return name

    def __repr__(self) -> str:
        """Return the `typename` of this instance, followed by parentheses."""
        name = typename(self)
        return f'{name}()'

    def __eq__(self, other: typing.Any) -> bool:
        """Return whether *other* has a compatible type, the same
        CONTENT_READER, and the same schema as this extractor.
        """
        # Objects that are not an instance of this extractor's type never match.
        if not isinstance(other, type(self)):
            return False
        # Both extractors have to expect the same kind of content ...
        if not self.CONTENT_READER == other.CONTENT_READER:
            return False
        # ... and have to be described by the same schema.
        return self.schema == other.schema

    def __hash__(self) -> int:
        """Return a hash over the concrete type, CONTENT_READER, and schema."""
        identity = (type(self), self.CONTENT_READER, self.schema)
        return hash(identity)

    def predicates(self) -> typing.Iterator[_schema.Predicate]:
        """Return the predicates that may be part of extracted triples."""
        # NOTE: The schema may define predicates that never show up in actual
        # triples because they only exist due to the predicate class hierarchy.
        # E.g., bsfs:Predicate is part of every schema but should not be used
        # in triples.
        # Announcing all of the schema's predicates might not be the most
        # efficient choice, but it is the safest one. Concrete extractors that
        # produce additional predicates (e.g. auxiliary nodes with their own
        # predicates) should override this method so that it only includes
        # the principal predicates.
        schema = self.schema
        return schema.predicates()

    @abc.abstractmethod
    def extract(
            self,
            subject: node.Node,
            content: typing.Any,
            predicates: typing.Iterable[_schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
        """Return (node, predicate, value) triples.

        This method is abstract; every concrete extractor has to implement it.
        The triples are to be produced from *content* for the given *subject*.
        NOTE(review): *predicates* presumably selects which predicates
        the caller wants to have extracted -- confirm against concrete
        extractors and callers.
        """
## EOF ##
|