1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
"""The Extractor classes transform content into triples.
Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# imports
import abc
import typing
# bsie imports
from bsie.utils import node
from bsie.utils.bsfs import schema as _schema, typename
# exports
# Public interface of this module: only the names listed here are picked up
# by a star-import (`from <module> import *`).
__all__: typing.Sequence[str] = (
    'Extractor',
)
# constants
# Schema text holding the essential definitions typically used in extractor
# schemas: the common external prefixes (owl, rdf, rdfs, xsd, schema), the
# bsfs prefixes (bsfs, bse), the statement that bsfs:Entity is a subclass of
# bsfs:Node, and the statements that xsd:string and xsd:integer are
# subclasses of bsfs:Literal.
# NOTE: The definition here is only for convenience; nothing in this module
# applies it automatically. Each Extractor must make use of it itself, if so
# desired.
SCHEMA_PREAMBLE = '''
# common external prefixes
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix schema: <http://schema.org/>
# common bsfs prefixes
prefix bsfs: <http://bsfs.ai/schema/>
prefix bse: <http://bsfs.ai/schema/Entity#>
# essential nodes
bsfs:Entity rdfs:subClassOf bsfs:Node .
# common definitions
xsd:string rdfs:subClassOf bsfs:Literal .
xsd:integer rdfs:subClassOf bsfs:Literal .
'''
## code ##
class Extractor(abc.ABC):
    """Abstract base class for turning content into triples.

    Concrete extractors implement :meth:`extract`, which produces
    (node, predicate, value) triples. Each instance carries the schema it
    was constructed with; two extractors compare equal when one is an
    instance of the other's type and both their content reader and their
    schema match.
    """

    # Type of content the extractor expects (i.e. a reader subclass).
    # Stays None unless a subclass assigns it.
    CONTENT_READER: typing.Optional[str] = None

    # Schema of the extractor.
    schema: _schema.Schema

    def __init__(self, schema: _schema.Schema):
        """Remember *schema* as this extractor's schema."""
        self.schema = schema

    def __str__(self) -> str:
        """Return the name that ``typename`` reports for this instance."""
        return typename(self)

    def __repr__(self) -> str:
        """Return the ``typename`` of this instance followed by ``()``."""
        name = typename(self)
        return f'{name}()'

    def __eq__(self, other: typing.Any) -> bool:
        """Compare by type, content reader, and schema."""
        # Anything that is not an instance of our own type never matches.
        if not isinstance(other, type(self)):
            return False
        same_reader = self.CONTENT_READER == other.CONTENT_READER
        # The schema comparison only runs if the readers already agree.
        return same_reader and self.schema == other.schema

    def __hash__(self) -> int:
        """Hash over the same fields that take part in equality."""
        identity = (type(self), self.CONTENT_READER, self.schema)
        return hash(identity)

    def predicates(self) -> typing.Iterator[_schema.Predicate]:
        """Return the predicates that extracted triples may contain."""
        # NOTE: The schema can hold predicates that never show up in actual
        # triples because they exist only for the predicate class hierarchy
        # (e.g. bsfs:Predicate is in every schema but should not be used in
        # triples). Reporting every schema predicate may not be the most
        # efficient choice, but it is the safest one. A concrete extractor
        # that emits additional predicates (e.g. auxiliary nodes with their
        # own predicates) should override this method so that only the
        # principal predicates are reported.
        announced = self.schema.predicates()
        return announced

    @abc.abstractmethod
    def extract(
            self,
            subject: node.Node,
            content: typing.Any,
            predicates: typing.Iterable[_schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[node.Node, _schema.Predicate, typing.Any]]:
        """Return (node, predicate, value) triples.

        Abstract: every concrete extractor has to provide its own
        implementation.
        """
## EOF ##
|