# Provenance: commit 11b26a913d39edb7f36cd0a3b3d8e74c96738579
# (Matthias Baumgartner, 2023-07-28), "document digestion: plaintext reader,
# text metrics extractor, text summary extractor".
# File: bsie/extractor/text/summary.py

# standard imports
import typing

# external imports
import transformers

# bsie imports
from bsie.extractor import base
from bsie.matcher import nodes
from bsie.utils import bsfs, errors, ns

# exports
__all__: typing.Sequence[str] = (
    'Summary',
    )


## code ##

class Summary(base.Extractor):
    """Extract a text summary.

    Uses the following summarization model:
        https://huggingface.co/Joemgu/mlong-t5-large-sumstew

    The document's text lines are joined with newlines, passed through the
    summarization pipeline, and the resulting summary is emitted as the
    value of the `bse:summary` predicate of the subject.

    """

    CONTENT_READER = 'bsie.reader.document.Document'

    # Marker that separates the title from the summary in the model output.
    _SUMMARY_PREFIX = 'Summary: '

    # Predicate (bse:summary) under which the summary is emitted.
    _predicate: bsfs.schema.Predicate

    # Keyword arguments forwarded to the summarizer on every call.
    _generator_kwargs: typing.Dict[str, typing.Union[int, float]]

    # Summarization pipeline, created once in the constructor.
    _summarizer: transformers.pipelines.text2text_generation.SummarizationPipeline

    def __init__(
            self,
            max_length: int = 1024, # summary length in tokens
            num_beams: int = 4, # higher = better, but uses more memory
            length_penalty: float = 1.0, # higher = longer summaries
            ):
        """Set up the schema and the summarization pipeline.

        :param max_length: Summary length in tokens.
        :param num_beams: Higher is better, but uses more memory.
        :param length_penalty: Higher values yield longer summaries.
        """
        super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
            bse:summary rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:string ;
                bsfs:unique "true"^^xsd:boolean .
            '''))
        self._predicate = self.schema.predicate(ns.bse.summary)
        self._generator_kwargs = dict(
            max_length=max_length,
            num_beams=num_beams,
            length_penalty=length_penalty,
            )
        self._summarizer = transformers.pipeline(
            "summarization",
            model="joemgu/mlong-t5-large-sumstew",
            )

    @classmethod
    def _strip_title(cls, title_and_summary: str) -> str:
        """Return the text that follows the first 'Summary: ' marker.

        If the marker is absent, the text is returned unchanged. (Slicing at
        `find(...) + len(prefix)` would cut off the leading characters in
        that case because `find` returns -1.)
        """
        _, marker, summary = title_and_summary.partition(cls._SUMMARY_PREFIX)
        return summary if marker else title_and_summary

    def extract(
            self,
            subject: nodes.Entity,
            content: typing.Sequence[str],
            principals: typing.Iterable[bsfs.schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, str]]:
        """Yield at most one (subject, bse:summary, summary) triple.

        :param subject: Node the summary is attached to.
        :param content: Text lines of the document; joined with newlines.
        :param principals: Requested predicates. Nothing is produced unless
            the summary predicate is among them.
        """
        # check predicates
        if self._predicate not in principals:
            return
        # preprocess
        text = '\n'.join(content)
        # there is nothing to summarize in an empty document
        if not text.strip():
            return
        # generate summary
        summaries = self._summarizer(text, **self._generator_kwargs)
        if not summaries:
            return
        # fetch summary, ignore title
        summary = self._strip_title(summaries[0]['summary_text'])
        yield subject, self._predicate, summary

## EOF ##