diff options
Diffstat (limited to 'test/extractor/text/test_metrics.py')
-rw-r--r-- | test/extractor/text/test_metrics.py | 75 |
1 files changed, 75 insertions, 0 deletions
# test/extractor/text/test_metrics.py

# standard imports
import os
import unittest

# bsie imports
from bsie.extractor import base
from bsie.matcher import nodes
from bsie.reader.document import Document
from bsie.utils import bsfs, ns

# objects to test
from bsie.extractor.text.metrics import TextMetrics


## code ##

class TestTextMetrics(unittest.TestCase):
    """Tests for the schema and the extracted triples of TextMetrics."""

    def test_schema(self):
        """The schema is the base preamble plus the five metric predicates."""
        actual = TextMetrics().schema
        expected = bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
            bse:num_characters rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:num_paragraphs rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:num_words rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:vocabulary_size rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:vocabulary_entropy rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:float ;
                bsfs:unique "true"^^xsd:boolean .

            ''')
        self.assertEqual(actual, expected)

    def test_extract(self):
        """Extract metrics from 'example-en.txt' and compare them to fixed values.

        The integer-valued metrics are compared exactly; the entropy is a
        float and is therefore checked separately with assertAlmostEqual.
        """
        # setup
        reader = Document()
        extractor = TextMetrics()
        subject = nodes.Entity(ucid='abc123')
        principals = set(extractor.principals)
        # the example document is located next to this test file
        here = os.path.dirname(__file__)
        path = os.path.join(here, 'example-en.txt')
        # fetch document
        text = reader(path)
        triples = set(extractor.extract(subject, text, principals))

        # every triple except the entropy one must match exactly
        exact = set()
        for node, pred, value in triples:
            if pred.uri != ns.bse.vocabulary_entropy:
                exact.add((node, pred, value))
        expected_values = (
            (ns.bse.num_characters, 21997),
            (ns.bse.num_paragraphs, 48),
            (ns.bse.num_words, 4234),
            (ns.bse.vocabulary_size, 3510),
        )
        expected = set()
        for uri, value in expected_values:
            expected.add((subject, extractor.schema.predicate(uri), value))
        self.assertSetEqual(exact, expected)

        # exactly one entropy value is expected; compare it approximately
        entropy = set()
        for _, pred, value in triples:
            if pred.uri == ns.bse.vocabulary_entropy:
                entropy.add(value)
        self.assertEqual(len(entropy), 1)
        self.assertAlmostEqual(next(iter(entropy)), 8.830360505)


## main ##

if __name__ == '__main__':
    unittest.main()

## EOF ##