Diffstat (limited to 'bsie')
-rw-r--r--  bsie/extractor/text/__init__.py   |   8
-rw-r--r--  bsie/extractor/text/metrics.py    | 100
-rw-r--r--  bsie/extractor/text/summary.py    |  79
-rw-r--r--  bsie/reader/document/__init__.py  |  32
-rw-r--r--  bsie/reader/document/_plain.py    |  38
5 files changed, 257 insertions(+), 0 deletions(-)
diff --git a/bsie/extractor/text/__init__.py b/bsie/extractor/text/__init__.py
new file mode 100644
index 0000000..f82424a
--- /dev/null
+++ b/bsie/extractor/text/__init__.py
@@ -0,0 +1,8 @@
+
+# standard imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/text/metrics.py b/bsie/extractor/text/metrics.py
new file mode 100644
index 0000000..ddb943f
--- /dev/null
+++ b/bsie/extractor/text/metrics.py
@@ -0,0 +1,100 @@
+
+# standard imports
+from collections import Counter
+import math
+import typing
+
+# bsie imports
+from bsie.extractor import base
+from bsie.matcher import nodes
+from bsie.utils import bsfs, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'TextMetrics',
+ )
+
+
+## code ##
+
+log2 = math.log2  # built-in base-2 logarithm
+
+class TextMetrics(base.Extractor):
+ """Extract text metrics (character, word, and line counts) from a document."""
+
+ CONTENT_READER = 'bsie.reader.document.Document'
+
+    _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[typing.Sequence[str]], typing.Any]]
+
+ def __init__(self):
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+ bse:num_characters rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:num_paragraphs rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:num_words rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:vocabulary_size rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:vocabulary_entropy rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+ self._callmap = {
+ self.schema.predicate(ns.bse.num_characters): self.__num_characters,
+ self.schema.predicate(ns.bse.num_paragraphs): self.__num_paragraphs,
+ self.schema.predicate(ns.bse.num_words): self.__num_words,
+ self.schema.predicate(ns.bse.vocabulary_size): self.__vocab_size,
+ self.schema.predicate(ns.bse.vocabulary_entropy): self.__entropy,
+ }
+
+ def extract(
+ self,
+ subject: nodes.Entity,
+ content: typing.Sequence[str],
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, typing.Any]]:
+ for pred in principals:
+ # find callback
+ clbk = self._callmap.get(pred)
+ if clbk is None:
+ continue
+ # produce triple
+ yield subject, pred, clbk(content)
+
+ def __num_words(self, text: typing.Sequence[str]) -> int:
+        return sum(len(paragraph.split()) for paragraph in text)
+
+ def __num_characters(self, text: typing.Sequence[str]) -> int:
+        return sum(len(paragraph) for paragraph in text)
+
+ def __num_paragraphs(self, text: typing.Sequence[str]) -> int:
+ return len(text)
+
+ def __vocab_size(self, text: typing.Sequence[str]) -> int:
+        return len({word for paragraph in text for word in paragraph.split()})
+
+    def __entropy(self, text: typing.Sequence[str]) -> float:
+        words = [word for paragraph in text for word in paragraph.split()]
+        word_histogram = Counter(words)
+        num_words = len(words)
+        return -sum(
+            count / num_words * log2(count / num_words)
+            for count
+            in word_histogram.values()
+        )
+
+## EOF ##
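
The value reported as bse:vocabulary_entropy is the Shannon entropy of the empirical word distribution, in bits. A minimal standalone sketch of the same computation (the sample input below is made up for illustration):

    from collections import Counter
    import math

    def word_entropy(paragraphs):
        # flatten paragraphs into words and build a histogram
        words = [word for paragraph in paragraphs for word in paragraph.split()]
        histogram = Counter(words)
        num_words = len(words)
        # Shannon entropy: -sum(p * log2(p)) over the empirical word probabilities
        return -sum(
            count / num_words * math.log2(count / num_words)
            for count in histogram.values()
        )

    # four distinct words, uniformly distributed -> log2(4) = 2.0 bits
    print(word_entropy(['to be', 'or not']))
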
diff --git a/bsie/extractor/text/summary.py b/bsie/extractor/text/summary.py
new file mode 100644
index 0000000..cc8d90d
--- /dev/null
+++ b/bsie/extractor/text/summary.py
@@ -0,0 +1,79 @@
+
+# standard imports
+import typing
+
+# external imports
+import transformers
+
+# bsie imports
+from bsie.extractor import base
+from bsie.matcher import nodes
+from bsie.utils import bsfs, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Summary',
+ )
+
+
+## code ##
+
+class Summary(base.Extractor):
+ """Extract a text summary.
+
+ Uses the following summarization model:
+ https://huggingface.co/Joemgu/mlong-t5-large-sumstew
+
+ """
+
+ CONTENT_READER = 'bsie.reader.document.Document'
+
+    _predicate: bsfs.schema.Predicate
+    _generator_kwargs: typing.Dict[str, typing.Any]
+    _summarizer: transformers.pipelines.text2text_generation.SummarizationPipeline
+
+ def __init__(
+ self,
+ max_length: int = 1024, # summary length in tokens
+ num_beams: int = 4, # higher = better, but uses more memory
+ length_penalty: float = 1.0, # higher = longer summaries
+ ):
+ super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+ bse:summary rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:string ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+ self._predicate = self.schema.predicate(ns.bse.summary)
+ self._generator_kwargs = dict(
+ max_length=max_length,
+ num_beams=num_beams,
+ length_penalty=length_penalty,
+ )
+ self._summarizer = transformers.pipeline(
+ "summarization",
+ model="joemgu/mlong-t5-large-sumstew",
+ )
+
+ def extract(
+ self,
+ subject: nodes.Entity,
+ content: typing.Sequence[str],
+ principals: typing.Iterable[bsfs.schema.Predicate],
+ ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, str]]:
+ # check predicates
+ if self._predicate not in principals:
+ return
+ # preprocess
+ text = '\n'.join(content)
+ # generate summary
+ summaries = self._summarizer(text, **self._generator_kwargs)
+ if len(summaries) == 0:
+ return
+        # fetch summary; drop the leading title if the model emitted one
+        title_and_summary = summaries[0]['summary_text']
+        _, found, rest = title_and_summary.partition('Summary: ')
+        summary = rest if found else title_and_summary
+ yield subject, self._predicate, summary
+
+## EOF ##
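
The 'Summary: ' handling in extract exists because, per the "ignore title" comment, the model output may lead with a title before the actual summary. A minimal sketch of that stripping logic in isolation (the example strings are hypothetical):

    def strip_title(summary_text: str, prefix: str = 'Summary: ') -> str:
        # keep only the text after the 'Summary: ' marker, if present;
        # otherwise return the model output unchanged
        _, found, rest = summary_text.partition(prefix)
        return rest if found else summary_text

    assert strip_title('Title: A Tale. Summary: Two cities, one plot.') == 'Two cities, one plot.'
    assert strip_title('no marker here') == 'no marker here'
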
diff --git a/bsie/reader/document/__init__.py b/bsie/reader/document/__init__.py
new file mode 100644
index 0000000..4ae3613
--- /dev/null
+++ b/bsie/reader/document/__init__.py
@@ -0,0 +1,32 @@
+
+# standard imports
+import typing
+
+# inner-module imports
+from .. import chain
+
+# constants
+_FILE_FORMAT_READERS: typing.Sequence[str] = (
+ #__package__ + '._docx.Docx',
+ #__package__ + '._odt.ODT',
+ #__package__ + '._pdf.PDF',
+ #__package__ + '._rtf.RTF',
+ #__package__ + '._ps.PS',
+ __package__ + '._plain.Plain',
+ )
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Document',
+ )
+
+
+## code ##
+
+class Document(chain.ReaderChain[typing.Sequence[str]]):
+ """Read paragraphs from a text file."""
+
+ def __init__(self, cfg: typing.Optional[typing.Any] = None):
+ super().__init__(_FILE_FORMAT_READERS, cfg)
+
+## EOF ##
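
chain.ReaderChain itself is outside this patch; as a rough mental model (an assumption, not the actual bsie.reader.chain code), a reader chain tries each configured reader in turn and returns the first successful result:

    import typing

    class UnsupportedFileFormatError(Exception):
        """Raised by a reader that cannot handle the given file."""

    Reader = typing.Callable[[str], typing.Sequence[str]]

    def read_first(readers: typing.Sequence[Reader], path: str) -> typing.Sequence[str]:
        # try readers in order; a reader signals "not my format" by raising
        for reader in readers:
            try:
                return reader(path)
            except UnsupportedFileFormatError:
                continue
        raise UnsupportedFileFormatError(path)
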
diff --git a/bsie/reader/document/_plain.py b/bsie/reader/document/_plain.py
new file mode 100644
index 0000000..a589265
--- /dev/null
+++ b/bsie/reader/document/_plain.py
@@ -0,0 +1,38 @@
+
+# standard imports
+import typing
+
+# bsie imports
+from bsie.utils import errors, filematcher
+
+# inner-module imports
+from .. import base
+
+# constants
+MATCH_RULE = 'mime=text/plain'
+
+# exports
+__all__: typing.Sequence[str] = (
+ 'Plain',
+ )
+
+
+## code ##
+
+class Plain(base.Reader):
+    """Read non-empty lines from a plain text file, one paragraph per line."""
+ _match: filematcher.Matcher
+
+ def __init__(self):
+ self._match = filematcher.parse(MATCH_RULE)
+
+ def __call__(self, path: str) -> typing.Sequence[str]:
+ # perform quick checks first
+ if not self._match(path):
+ raise errors.UnsupportedFileFormatError(path)
+
+ # open file in text mode
+        with open(path, 'rt', encoding='utf-8', errors='replace') as ifile:
+            return [line.strip() for line in ifile if line.strip()]
+
+## EOF ##
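
Taken together with the Document chain above, reading a plain text file boils down to collecting its non-empty lines. A self-contained sketch of that parsing step (the file path is hypothetical, and the mime check via bsie.utils.filematcher is omitted):

    import typing

    def read_paragraphs(path: str) -> typing.List[str]:
        # one paragraph per non-empty line, surrounding whitespace stripped
        with open(path, 'rt', encoding='utf-8', errors='replace') as ifile:
            return [line.strip() for line in ifile if line.strip()]

    # hypothetical usage:
    # print(read_paragraphs('/tmp/notes.txt'))
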