author     Matthias Baumgartner <dev@igsor.net>  2023-07-28 11:31:24 +0200
committer  Matthias Baumgartner <dev@igsor.net>  2023-07-28 11:31:24 +0200
commit     11b26a913d39edb7f36cd0a3b3d8e74c96738579 (patch)
tree       463af082c0b77916c11a84263c96fc91ebedabed /bsie/extractor/text
parent     28e3640e0b5e03b50bf66711f46937f07a3d7fef (diff)
document digestion:
* plaintext reader
* text metrics extractor
* text summary extractor
Diffstat (limited to 'bsie/extractor/text')
-rw-r--r--  bsie/extractor/text/__init__.py |   8
-rw-r--r--  bsie/extractor/text/metrics.py  | 100
-rw-r--r--  bsie/extractor/text/summary.py  |  79
3 files changed, 187 insertions(+), 0 deletions(-)
diff --git a/bsie/extractor/text/__init__.py b/bsie/extractor/text/__init__.py
new file mode 100644
index 0000000..f82424a
--- /dev/null
+++ b/bsie/extractor/text/__init__.py
@@ -0,0 +1,8 @@
+
+# standard imports
+import typing
+
+# exports
+__all__: typing.Sequence[str] = []
+
+## EOF ##
diff --git a/bsie/extractor/text/metrics.py b/bsie/extractor/text/metrics.py
new file mode 100644
index 0000000..ddb943f
--- /dev/null
+++ b/bsie/extractor/text/metrics.py
@@ -0,0 +1,100 @@
+
+# standard imports
+from collections import Counter
+import math
+import typing
+
+# bsie imports
+from bsie.extractor import base
+from bsie.matcher import nodes
+from bsie.utils import bsfs, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'TextMetrics',
+    )
+
+
+## code ##
+
+class TextMetrics(base.Extractor):
+    """Extract text metrics (character, word, and paragraph counts, vocabulary size and entropy) from a document."""
+
+    CONTENT_READER = 'bsie.reader.document.Document'
+
+    # maps each supported predicate to the callback that computes its value
+    _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[typing.Sequence[str]], typing.Any]]
+
+    def __init__(self):
+        super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+            bse:num_characters rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsn:Entity ;
+                rdfs:range xsd:integer ;
+                bsfs:unique "true"^^xsd:boolean .
+
+            bse:num_paragraphs rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsn:Entity ;
+                rdfs:range xsd:integer ;
+                bsfs:unique "true"^^xsd:boolean .
+
+            bse:num_words rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsn:Entity ;
+                rdfs:range xsd:integer ;
+                bsfs:unique "true"^^xsd:boolean .
+
+            bse:vocabulary_size rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsn:Entity ;
+                rdfs:range xsd:integer ;
+                bsfs:unique "true"^^xsd:boolean .
+
+            bse:vocabulary_entropy rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsn:Entity ;
+                rdfs:range xsd:float ;
+                bsfs:unique "true"^^xsd:boolean .
+            '''))
+        self._callmap = {
+            self.schema.predicate(ns.bse.num_characters): self.__num_characters,
+            self.schema.predicate(ns.bse.num_paragraphs): self.__num_paragraphs,
+            self.schema.predicate(ns.bse.num_words): self.__num_words,
+            self.schema.predicate(ns.bse.vocabulary_size): self.__vocab_size,
+            self.schema.predicate(ns.bse.vocabulary_entropy): self.__entropy,
+            }
+
+    def extract(
+            self,
+            subject: nodes.Entity,
+            content: typing.Sequence[str],
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, typing.Any]]:
+        for pred in principals:
+            # find callback
+            clbk = self._callmap.get(pred)
+            if clbk is None:
+                continue
+            # produce triple
+            yield subject, pred, clbk(content)
+
+    def __num_words(self, text: typing.Sequence[str]) -> int:
+        return sum(len(paragraph.split()) for paragraph in text)
+
+    def __num_characters(self, text: typing.Sequence[str]) -> int:
+        return sum(len(paragraph) for paragraph in text)
+
+    def __num_paragraphs(self, text: typing.Sequence[str]) -> int:
+        return len(text)
+
+    def __vocab_size(self, text: typing.Sequence[str]) -> int:
+        # number of distinct words across all paragraphs
+        return len({word for paragraph in text for word in paragraph.split()})
+
+    def __entropy(self, text: typing.Sequence[str]) -> float:
+        # Shannon entropy (in bits) of the document's word distribution
+        words = [word for paragraph in text for word in paragraph.split()]
+        word_histogram = Counter(words)
+        num_words = len(words)
+        return -sum(
+            count / num_words * math.log2(count / num_words)
+            for count
+            in word_histogram.values()
+            )
+
+## EOF ##
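For reference, the __entropy callback above computes the Shannon entropy of the word distribution, H = -Σ_w p(w) · log2 p(w), where p(w) is the relative frequency count(w) / num_words. A minimal standalone sketch of the same computation (the sample text is made up for illustration):

    from collections import Counter
    import math

    paragraphs = ['the cat sat on the mat', 'the dog sat']
    words = [word for paragraph in paragraphs for word in paragraph.split()]
    histogram = Counter(words)  # {'the': 3, 'sat': 2, 'cat': 1, 'on': 1, 'mat': 1, 'dog': 1}
    num_words = len(words)      # 9
    entropy = -sum(
        count / num_words * math.log2(count / num_words)
        for count in histogram.values()
    )
    # 'the' occurs 3/9, 'sat' 2/9, and four words 1/9 each:
    # H = -(3/9·log2(3/9) + 2/9·log2(2/9) + 4·(1/9)·log2(1/9)) ≈ 2.42 bits

A document that repeats a single word has entropy 0; the value grows with vocabulary size and with how evenly the words are used, which is what bse:vocabulary_entropy records.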
diff --git a/bsie/extractor/text/summary.py b/bsie/extractor/text/summary.py
new file mode 100644
index 0000000..cc8d90d
--- /dev/null
+++ b/bsie/extractor/text/summary.py
@@ -0,0 +1,79 @@
+
+# standard imports
+import typing
+
+# external imports
+import transformers
+
+# bsie imports
+from bsie.extractor import base
+from bsie.matcher import nodes
+from bsie.utils import bsfs, ns
+
+# exports
+__all__: typing.Sequence[str] = (
+    'Summary',
+    )
+
+
+## code ##
+
+class Summary(base.Extractor):
+    """Extract a text summary.
+
+    Uses the following summarization model:
+    https://huggingface.co/Joemgu/mlong-t5-large-sumstew
+
+    """
+
+    CONTENT_READER = 'bsie.reader.document.Document'
+
+    # predicate produced by this extractor
+    _predicate: bsfs.schema.Predicate
+
+    # summarization pipeline
+    _summarizer: transformers.pipelines.text2text_generation.SummarizationPipeline
+
+    def __init__(
+            self,
+            max_length: int = 1024,       # maximum summary length in tokens
+            num_beams: int = 4,           # higher = better, but uses more memory
+            length_penalty: float = 1.0,  # higher = longer summaries
+            ):
+        super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+            bse:summary rdfs:subClassOf bsfs:Predicate ;
+                rdfs:domain bsn:Entity ;
+                rdfs:range xsd:string ;
+                bsfs:unique "true"^^xsd:boolean .
+            '''))
+        self._predicate = self.schema.predicate(ns.bse.summary)
+        self._generator_kwargs = dict(
+            max_length=max_length,
+            num_beams=num_beams,
+            length_penalty=length_penalty,
+            )
+        self._summarizer = transformers.pipeline(
+            "summarization",
+            model="joemgu/mlong-t5-large-sumstew",
+            )
+
+    def extract(
+            self,
+            subject: nodes.Entity,
+            content: typing.Sequence[str],
+            principals: typing.Iterable[bsfs.schema.Predicate],
+            ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, str]]:
+        # check predicates
+        if self._predicate not in principals:
+            return
+        # preprocess: re-join paragraphs into a single text
+        text = '\n'.join(content)
+        # generate summary
+        summaries = self._summarizer(text, **self._generator_kwargs)
+        if len(summaries) == 0:
+            return
+        # the model emits a title followed by 'Summary: '; keep only the summary
+        prefix = 'Summary: '
+        title_and_summary = summaries[0]['summary_text']
+        summary = title_and_summary.split(prefix, 1)[-1]
+        yield subject, self._predicate, summary
+
+## EOF ##
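The summarization step can be exercised on its own. A minimal sketch, assuming the transformers package is installed and the model can be fetched from the Hugging Face hub (the input text is made up; the generation parameters mirror the defaults in Summary.__init__):

    import transformers

    summarizer = transformers.pipeline(
        "summarization",
        model="joemgu/mlong-t5-large-sumstew",
    )
    paragraphs = ['First paragraph of the document.', 'Second paragraph.']
    text = '\n'.join(paragraphs)  # same preprocessing as Summary.extract
    outputs = summarizer(text, max_length=1024, num_beams=4, length_penalty=1.0)
    print(outputs[0]['summary_text'])

The model prefixes its output with a generated title followed by 'Summary: ', which is why extract() keeps only the text after that prefix before yielding the triple.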