1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
"""The Extractor classes transform content into triples.
Part of the bsie module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# standard imports
import abc
import typing
# bsie imports
from bsie.utils import bsfs, node, ns
# exports
# Names made available by a star-import of this module. Only the Extractor
# base class is listed; SCHEMA_PREAMBLE has to be imported by name.
__all__: typing.Sequence[str] = (
'Extractor',
)
# constants
# Essential definitions typically used in extractor schemas.
# The text declares common external prefixes (rdf, rdfs, xsd, schema) and
# bsfs prefixes (bsfs, bse, bsp), followed by the basic literal classes
# (Array, Number, Time, Feature), the Entity and File node classes, and the
# xsd string/integer/float literal types.
# NOTE: This preamble is only for convenience; each Extractor must implement
# its use, if so desired.
# NOTE(review): presumably this text is prepended to an extractor's own schema
# definition before parsing -- confirm against the concrete extractors.
SCHEMA_PREAMBLE = '''
# common external prefixes
prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix xsd: <http://www.w3.org/2001/XMLSchema#>
prefix schema: <http://schema.org/>
# common bsfs prefixes
prefix bsfs: <http://bsfs.ai/schema/>
prefix bse: <http://bsfs.ai/schema/Entity#>
prefix bsp: <http://bsfs.ai/schema/Preview#>
# default definitions
bsfs:Array rdfs:subClassOf bsfs:Literal .
bsfs:Number rdfs:subClassOf bsfs:Literal .
bsfs:Time rdfs:subClassOf bsfs:Literal .
bsfs:Feature rdfs:subClassOf bsfs:Array ;
bsfs:dimension "1"^^xsd:integer ;
bsfs:dtype bsfs:f16 ;
bsfs:distance bsfs:euclidean .
# essential nodes
bsfs:Entity rdfs:subClassOf bsfs:Node .
bsfs:File rdfs:subClassOf bsfs:Entity .
# common definitions
xsd:string rdfs:subClassOf bsfs:Literal .
xsd:integer rdfs:subClassOf bsfs:Number .
xsd:float rdfs:subClassOf bsfs:Number .
'''
## code ##
class Extractor(abc.ABC):
    """Produce (subject, predicate, value)-triples from some content.

    An Extractor emits principal predicates, which provide information
    about the content itself (i.e., triples that include the subject).
    If an extracted value is a node itself, the Extractor may also emit
    triples with auxiliary predicates.
    """

    # what type of content is expected (i.e. reader subclass).
    CONTENT_READER: typing.Optional[str] = None

    # extractor schema.
    _schema: bsfs.schema.Schema

    def __init__(self, schema: bsfs.schema.Schema):
        """Keep a reference to *schema*; it is exposed via the `schema` property."""
        self._schema = schema

    def __str__(self) -> str:
        """Return the name that `bsfs.typename` reports for this instance."""
        return bsfs.typename(self)

    def __repr__(self) -> str:
        """Return the `bsfs.typename` of this instance followed by '()'."""
        return f'{bsfs.typename(self)}()'

    def __eq__(self, other: typing.Any) -> bool:
        """Compare by type, expected content reader, and schema."""
        # other must be an instance of this extractor's (sub)class
        if not isinstance(other, type(self)):
            return False
        # both extractors must expect the same kind of content
        if not (self.CONTENT_READER == other.CONTENT_READER):
            return False
        # finally, the schemas decide
        return self.schema == other.schema

    def __hash__(self) -> int:
        """Hash over the same members that `__eq__` compares."""
        identity = (type(self), self.CONTENT_READER, self.schema)
        return hash(identity)

    @property
    def schema(self) -> bsfs.schema.Schema:
        """Return the extractor's schema."""
        return self._schema

    @property
    def principals(self) -> typing.Iterator[bsfs.schema.Predicate]:
        """Return the principal predicates, i.e., relations from/to the extraction subject."""
        entity = self.schema.node(ns.bsfs.Entity)

        def involves_entity(candidate) -> bool:
            # NOTE(review): `<=` is the schema's type comparison; presumably
            # "is the same as or derived from" -- confirm against bsfs.
            if candidate.domain <= entity:
                return True
            # a predicate may have no range, hence the None check
            return candidate.range is not None and candidate.range <= entity

        return (
            candidate
            for candidate in self.schema.predicates()
            if involves_entity(candidate)
        )

    @abc.abstractmethod
    def extract(
            self,
            subject: node.Node,
            content: typing.Any,
            principals: typing.Iterable[bsfs.schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[node.Node, bsfs.schema.Predicate, typing.Any]]:
        """Return (node, predicate, value) triples."""
        # FIXME: type annotation could be more strict: value is Hashable
## EOF ##
|