diff options
Diffstat (limited to 'bsie/extractor')
-rw-r--r-- | bsie/extractor/text/metrics.py | 12 | ||||
-rw-r--r-- | bsie/extractor/text/summary.py | 20 |
2 files changed, 17 insertions, 15 deletions
diff --git a/bsie/extractor/text/metrics.py b/bsie/extractor/text/metrics.py index ddb943f..91e0e22 100644 --- a/bsie/extractor/text/metrics.py +++ b/bsie/extractor/text/metrics.py @@ -17,14 +17,16 @@ __all__: typing.Sequence[str] = ( ## code ## -log2 = lambda x: math.log(x) / math.log(2) +def log2(value: float) -> float: + """Base 2 logarithm.""" + return math.log(value) / math.log(2) class TextMetrics(base.Extractor): """Extract text metrics (character, word, and line counts) from a document.""" CONTENT_READER = 'bsie.reader.document.Document' - _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]] + _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[typing.Sequence[str]], typing.Any]] def __init__(self): super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + ''' @@ -66,7 +68,7 @@ class TextMetrics(base.Extractor): subject: nodes.Entity, content: typing.Sequence[str], principals: typing.Iterable[bsfs.schema.Predicate], - ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, typing.Any]]: + ) -> typing.Iterator[typing.Tuple[nodes.Node, bsfs.schema.Predicate, typing.Any]]: for pred in principals: # find callback clbk = self._callmap.get(pred) @@ -76,10 +78,10 @@ class TextMetrics(base.Extractor): yield subject, pred, clbk(content) def __num_words(self, text: typing.Sequence[str]) -> int: - return sum([len(paragraph.split()) for paragraph in text]) + return sum(len(paragraph.split()) for paragraph in text) def __num_characters(self, text: typing.Sequence[str]) -> int: - return sum([len(paragraph) for paragraph in text]) + return sum(len(paragraph) for paragraph in text) def __num_paragraphs(self, text: typing.Sequence[str]) -> int: return len(text) diff --git a/bsie/extractor/text/summary.py b/bsie/extractor/text/summary.py index cc8d90d..2c9efef 100644 --- a/bsie/extractor/text/summary.py +++ b/bsie/extractor/text/summary.py @@ -8,11 +8,11 @@ import transformers # bsie imports from bsie.extractor import base from bsie.matcher import nodes -from bsie.utils import bsfs, errors, ns +from bsie.utils import bsfs, ns # exports __all__: typing.Sequence[str] = ( - 'Language', + 'Summary', ) @@ -51,8 +51,8 @@ class Summary(base.Extractor): length_penalty=length_penalty, ) self._summarizer = transformers.pipeline( - "summarization", - model="joemgu/mlong-t5-large-sumstew", + 'summarization', + model='joemgu/mlong-t5-large-sumstew', ) def extract( @@ -60,17 +60,17 @@ class Summary(base.Extractor): subject: nodes.Entity, content: typing.Sequence[str], principals: typing.Iterable[bsfs.schema.Predicate], - ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, str]]: + ) -> typing.Iterator[typing.Tuple[nodes.Node, bsfs.schema.Predicate, str]]: # check predicates if self._predicate not in principals: return # preprocess - text = '\n'.join(content) - # generate summary - summaries = self._summarizer(text, **self._generator_kwargs) - if len(summaries) == 0: + text = '\n'.join(content).strip() + if len(text) == 0: return - # fetch summary, ignore title + # fetch summary + summaries = self._summarizer(text, **self._generator_kwargs) + assert len(summaries) == 1 prefix = 'Summary: ' title_and_summary = summaries[0]['summary_text'] summary = title_and_summary[title_and_summary.find(prefix) + len(prefix):] |