about | summary | refs | log | tree | commit | diff | stats
path: root/test/extractor/text/test_metrics.py
diff options
context:
space:
mode:
Diffstat (limited to 'test/extractor/text/test_metrics.py')
-rw-r--r-- test/extractor/text/test_metrics.py 75
1 files changed, 75 insertions, 0 deletions
diff --git a/test/extractor/text/test_metrics.py b/test/extractor/text/test_metrics.py
new file mode 100644
index 0000000..9cc6a94
--- /dev/null
+++ b/test/extractor/text/test_metrics.py
@@ -0,0 +1,75 @@
+
+# standard imports
+import os
+import unittest
+
+# bsie imports
+from bsie.extractor import base
+from bsie.matcher import nodes
+from bsie.reader.document import Document
+from bsie.utils import bsfs, ns
+
+# objects to test
+from bsie.extractor.text.metrics import TextMetrics
+
+
+## code ##
+
+class TestTextMetrics(unittest.TestCase):
+    """Tests for the TextMetrics extractor."""
+
+    def test_schema(self):
+        """The schema is the base preamble plus the five metric predicates."""
+        actual = TextMetrics().schema
+        self.assertEqual(actual, bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
+ bse:num_characters rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:num_paragraphs rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:num_words rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:vocabulary_size rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:integer ;
+ bsfs:unique "true"^^xsd:boolean .
+
+ bse:vocabulary_entropy rdfs:subClassOf bsfs:Predicate ;
+ rdfs:domain bsn:Entity ;
+ rdfs:range xsd:float ;
+ bsfs:unique "true"^^xsd:boolean .
+ '''))
+
+    def test_extract(self):
+        """Metrics extracted from example-en.txt match the recorded values."""
+        reader = Document()
+        extractor = TextMetrics()
+        subject = nodes.Entity(ucid='abc123')
+        wanted = set(extractor.principals)
+        sample = os.path.join(os.path.dirname(__file__), 'example-en.txt')
+        triples = set(extractor.extract(subject, reader(sample), wanted))
+        entropy_uri = ns.bse.vocabulary_entropy
+        self.assertSetEqual({(n, p, v) for n, p, v in triples if p.uri != entropy_uri}, {
+            (subject, extractor.schema.predicate(ns.bse.num_characters), 21997),
+            (subject, extractor.schema.predicate(ns.bse.num_paragraphs), 48),
+            (subject, extractor.schema.predicate(ns.bse.num_words), 4234),
+            (subject, extractor.schema.predicate(ns.bse.vocabulary_size), 3510),
+        })
+        entropy = {v for _, p, v in triples if p.uri == entropy_uri}
+        self.assertEqual(len(entropy), 1)
+        self.assertAlmostEqual(next(iter(entropy)), 8.830360505)
+
+## main ##
+
+if __name__ == '__main__':  # run the test suite only when executed as a script
+ unittest.main()
+
+## EOF ##