aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatthias Baumgartner <dev@igsor.net>2023-07-28 12:00:11 +0200
committerMatthias Baumgartner <dev@igsor.net>2023-07-28 12:00:11 +0200
commitcf032db8785149689d94232b400e20e4d6336562 (patch)
treec912f62227b06430bf4c11a820c0a4b34b46512c
parent11b26a913d39edb7f36cd0a3b3d8e74c96738579 (diff)
downloadbsie-cf032db8785149689d94232b400e20e4d6336562.tar.gz
bsie-cf032db8785149689d94232b400e20e4d6336562.tar.bz2
bsie-cf032db8785149689d94232b400e20e4d6336562.zip
minor style and text fixes (develop)
-rw-r--r--bsie/extractor/text/metrics.py12
-rw-r--r--bsie/extractor/text/summary.py20
-rw-r--r--bsie/reader/document/__init__.py4
-rw-r--r--bsie/reader/document/_plain.py3
-rw-r--r--test/extractor/text/test_metrics.py5
-rw-r--r--test/extractor/text/test_summary.py10
-rw-r--r--test/reader/document/test_plain.py6
7 files changed, 39 insertions, 21 deletions
diff --git a/bsie/extractor/text/metrics.py b/bsie/extractor/text/metrics.py
index ddb943f..91e0e22 100644
--- a/bsie/extractor/text/metrics.py
+++ b/bsie/extractor/text/metrics.py
@@ -17,14 +17,16 @@ __all__: typing.Sequence[str] = (
## code ##
-log2 = lambda x: math.log(x) / math.log(2)
+def log2(value: float) -> float:
+ """Base 2 logarithm."""
+ return math.log(value) / math.log(2)
class TextMetrics(base.Extractor):
"""Extract text metrics (character, word, and line counts) from a document."""
CONTENT_READER = 'bsie.reader.document.Document'
- _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[str], typing.Any]]
+ _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[typing.Sequence[str]], typing.Any]]
def __init__(self):
super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
@@ -66,7 +68,7 @@ class TextMetrics(base.Extractor):
subject: nodes.Entity,
content: typing.Sequence[str],
principals: typing.Iterable[bsfs.schema.Predicate],
- ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, typing.Any]]:
+ ) -> typing.Iterator[typing.Tuple[nodes.Node, bsfs.schema.Predicate, typing.Any]]:
for pred in principals:
# find callback
clbk = self._callmap.get(pred)
@@ -76,10 +78,10 @@ class TextMetrics(base.Extractor):
yield subject, pred, clbk(content)
def __num_words(self, text: typing.Sequence[str]) -> int:
- return sum([len(paragraph.split()) for paragraph in text])
+ return sum(len(paragraph.split()) for paragraph in text)
def __num_characters(self, text: typing.Sequence[str]) -> int:
- return sum([len(paragraph) for paragraph in text])
+ return sum(len(paragraph) for paragraph in text)
def __num_paragraphs(self, text: typing.Sequence[str]) -> int:
return len(text)
diff --git a/bsie/extractor/text/summary.py b/bsie/extractor/text/summary.py
index cc8d90d..2c9efef 100644
--- a/bsie/extractor/text/summary.py
+++ b/bsie/extractor/text/summary.py
@@ -8,11 +8,11 @@ import transformers
# bsie imports
from bsie.extractor import base
from bsie.matcher import nodes
-from bsie.utils import bsfs, errors, ns
+from bsie.utils import bsfs, ns
# exports
__all__: typing.Sequence[str] = (
- 'Language',
+ 'Summary',
)
@@ -51,8 +51,8 @@ class Summary(base.Extractor):
length_penalty=length_penalty,
)
self._summarizer = transformers.pipeline(
- "summarization",
- model="joemgu/mlong-t5-large-sumstew",
+ 'summarization',
+ model='joemgu/mlong-t5-large-sumstew',
)
def extract(
@@ -60,17 +60,17 @@ class Summary(base.Extractor):
subject: nodes.Entity,
content: typing.Sequence[str],
principals: typing.Iterable[bsfs.schema.Predicate],
- ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, str]]:
+ ) -> typing.Iterator[typing.Tuple[nodes.Node, bsfs.schema.Predicate, str]]:
# check predicates
if self._predicate not in principals:
return
# preprocess
- text = '\n'.join(content)
- # generate summary
- summaries = self._summarizer(text, **self._generator_kwargs)
- if len(summaries) == 0:
+ text = '\n'.join(content).strip()
+ if len(text) == 0:
return
- # fetch summary, ignore title
+ # fetch summary
+ summaries = self._summarizer(text, **self._generator_kwargs)
+ assert len(summaries) == 1
prefix = 'Summary: '
title_and_summary = summaries[0]['summary_text']
summary = title_and_summary[title_and_summary.find(prefix) + len(prefix):]
diff --git a/bsie/reader/document/__init__.py b/bsie/reader/document/__init__.py
index 4ae3613..824ad86 100644
--- a/bsie/reader/document/__init__.py
+++ b/bsie/reader/document/__init__.py
@@ -17,13 +17,13 @@ _FILE_FORMAT_READERS: typing.Sequence[str] = (
# exports
__all__: typing.Sequence[str] = (
- 'Document'
+ 'Document',
)
## code ##
-class Document(chain.ReaderChain[typing.Sequence[str]]):
+class Document(chain.ReaderChain[typing.Sequence[str]]): # pylint: disable=too-few-public-methods
"""Read paragraphs from a text file."""
def __init__(self, cfg: typing.Optional[typing.Any] = None):
diff --git a/bsie/reader/document/_plain.py b/bsie/reader/document/_plain.py
index a589265..8ea3c43 100644
--- a/bsie/reader/document/_plain.py
+++ b/bsie/reader/document/_plain.py
@@ -20,6 +20,7 @@ __all__: typing.Sequence[str] = (
## code ##
class Plain(base.Reader):
+    """Read paragraphs (separated by newline) from a plain text file."""
_match: filematcher.Matcher
@@ -32,7 +33,7 @@ class Plain(base.Reader):
raise errors.UnsupportedFileFormatError(path)
# open file in text mode
- with open(path, 'rt') as ifile:
+ with open(path, 'rt', encoding='UTF-8') as ifile:
return [line.strip() for line in ifile.read().split('\n') if len(line.strip()) > 0]
## EOF ##
diff --git a/test/extractor/text/test_metrics.py b/test/extractor/text/test_metrics.py
index 9cc6a94..6d87889 100644
--- a/test/extractor/text/test_metrics.py
+++ b/test/extractor/text/test_metrics.py
@@ -55,8 +55,8 @@ class TestTextMetrics(unittest.TestCase):
path = os.path.join(os.path.dirname(__file__), 'example-en.txt')
# fetch document
text = rdr(path)
+ # extracts all specified predicates
triples = set(ext.extract(subject, text, principals))
-
self.assertSetEqual({(s,p,o) for s,p,o in triples if p.uri != ns.bse.vocabulary_entropy}, {
(subject, ext.schema.predicate(ns.bse.num_characters), 21997),
(subject, ext.schema.predicate(ns.bse.num_paragraphs), 48),
@@ -66,6 +66,9 @@ class TestTextMetrics(unittest.TestCase):
entropy = {o for s,p,o in triples if p.uri == ns.bse.vocabulary_entropy}
self.assertEqual(len(entropy), 1)
self.assertAlmostEqual(list(entropy)[0], 8.830360505)
+ # skip unknown predicates
+ self.assertSetEqual(set(), set(ext.extract(subject, text,
+ {ext.schema.predicate(ns.bsfs.Predicate).child(ns.bse.unknown)})))
## main ##
diff --git a/test/extractor/text/test_summary.py b/test/extractor/text/test_summary.py
index 78d3002..aee1ee2 100644
--- a/test/extractor/text/test_summary.py
+++ b/test/extractor/text/test_summary.py
@@ -36,14 +36,20 @@ class TestTextMetrics(unittest.TestCase):
path = os.path.join(os.path.dirname(__file__), 'example-en.txt')
# fetch document
text = rdr(path)
-
+ # empty input yields no triples
+ self.assertEqual(list(ext.extract(subject, [], principals)), [])
+ self.assertEqual(list(ext.extract(subject, [' '], principals)), [])
+ self.assertEqual(list(ext.extract(subject, [' ', ' ', ' '], principals)), [])
+ # creates a summary
with warnings.catch_warnings():
warnings.simplefilter('ignore', category=FutureWarning)
triples = list(ext.extract(subject, text, principals))
-
self.assertEqual(triples, [
(subject, ext.schema.predicate(ns.bse.summary),
'Alice is tired of sitting by her sister on the bank')])
+ # skip unknown predicates
+ self.assertSetEqual(set(), set(ext.extract(subject, text,
+ {ext.schema.predicate(ns.bsfs.Predicate).child(ns.bse.unknown)})))
## main ##
diff --git a/test/reader/document/test_plain.py b/test/reader/document/test_plain.py
index c63fb30..6bbf8c6 100644
--- a/test/reader/document/test_plain.py
+++ b/test/reader/document/test_plain.py
@@ -3,6 +3,9 @@
import os
import unittest
+# bsie imports
+from bsie.utils import errors
+
# objects to test
from bsie.reader.document._plain import Plain
@@ -29,6 +32,9 @@ class TestPlain(unittest.TestCase):
'Semper eget duis at tellus at. Neque egestas congue quisque egestas diam in arcu cursus euismod. Erat nam at lectus urna duis convallis convallis. Tempus urna et pharetra pharetra massa massa ultricies mi quis. Magna eget est lorem ipsum dolor sit amet consectetur.',
'Orci sagittis eu volutpat odio. Risus pretium quam vulputate dignissim suspendisse in. Volutpat est velit egestas dui id. Massa placerat duis ultricies lacus sed turpis. In nulla posuere sollicitudin aliquam ultrices sagittis orci a. Vel elit scelerisque mauris pellentesque pulvinar pellentesque.',
])
+ # ignores unknown files
+ self.assertRaises(errors.UnsupportedFileFormatError, rdr,
+ os.path.join(os.path.dirname(__file__), '..', 'testimage_exif.jpg'))
## main ##