diff options
Diffstat (limited to 'bsie/extractor/text/metrics.py')
-rw-r--r-- | bsie/extractor/text/metrics.py | 100 |
1 files changed, 100 insertions, 0 deletions

# standard imports
from collections import Counter
import math
import typing

# bsie imports
from bsie.extractor import base
from bsie.matcher import nodes
from bsie.utils import bsfs, ns

# exports
__all__: typing.Sequence[str] = (
    'TextMetrics',
    )


## code ##

# Base-2 logarithm. The module-level name is kept for backward compatibility;
# math.log2 is more accurate than math.log(x) / math.log(2).
log2 = math.log2


class TextMetrics(base.Extractor):
    """Extract text metrics from a document.

    The content is a sequence of strings, each of which is treated as one
    paragraph. Words are the whitespace-separated tokens of a paragraph
    (`str.split()`); they are compared case-sensitively and punctuation is
    not stripped.

    Provided predicates:

    * ``bse:num_characters``: Total number of characters over all paragraphs.
    * ``bse:num_paragraphs``: Number of paragraphs.
    * ``bse:num_words``: Total number of words.
    * ``bse:vocabulary_size``: Number of distinct words.
    * ``bse:vocabulary_entropy``: Shannon entropy (in bits) of the word
      distribution.

    """

    CONTENT_READER = 'bsie.reader.document.Document'

    # Maps each predicate of the schema to the method that computes its value
    # from the document's content.
    _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[typing.Sequence[str]], typing.Any]]

    def __init__(self):
        super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
            bse:num_characters rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:num_paragraphs rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:num_words rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:vocabulary_size rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:vocabulary_entropy rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:float ;
                bsfs:unique "true"^^xsd:boolean .

            '''))
        self._callmap = {
            self.schema.predicate(ns.bse.num_characters): self.__num_characters,
            self.schema.predicate(ns.bse.num_paragraphs): self.__num_paragraphs,
            self.schema.predicate(ns.bse.num_words): self.__num_words,
            self.schema.predicate(ns.bse.vocabulary_size): self.__vocab_size,
            self.schema.predicate(ns.bse.vocabulary_entropy): self.__entropy,
            }

    def extract(
            self,
            subject: nodes.Entity,
            content: typing.Sequence[str],
            principals: typing.Iterable[bsfs.schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, typing.Any]]:
        """Yield one (subject, predicate, value) triple per requested metric.

        Predicates in *principals* that have no entry in the callmap are
        skipped silently.
        """
        for pred in principals:
            # find callback
            clbk = self._callmap.get(pred)
            if clbk is None:
                continue
            # produce triple
            yield subject, pred, clbk(content)

    def __num_words(self, text: typing.Sequence[str]) -> int:
        """Return the total number of whitespace-separated words in *text*."""
        return sum(len(paragraph.split()) for paragraph in text)

    def __num_characters(self, text: typing.Sequence[str]) -> int:
        """Return the total number of characters over all paragraphs."""
        return sum(len(paragraph) for paragraph in text)

    def __num_paragraphs(self, text: typing.Sequence[str]) -> int:
        """Return the number of paragraphs, i.e., the number of items in *text*."""
        return len(text)

    def __vocab_size(self, text: typing.Sequence[str]) -> int:
        """Return the number of distinct words in *text*."""
        # Count unique words across all paragraphs. Summing the set of
        # per-paragraph word counts (as was done before) does not measure the
        # vocabulary and drops paragraphs that share the same word count.
        return len({word for paragraph in text for word in paragraph.split()})

    def __entropy(self, text: typing.Sequence[str]) -> float:
        """Return the Shannon entropy (in bits) of the word distribution.

        Returns 0.0 if *text* contains no words.
        """
        word_histogram = Counter(
            word
            for paragraph in text
            for word in paragraph.split()
            )
        num_words = sum(word_histogram.values())
        if num_words == 0:
            # no words, no uncertainty; also keeps the return type a float
            return 0.0
        # The sign is applied per term (instead of negating the total) so that
        # a single-word vocabulary produces 0.0 rather than -0.0.
        return sum(
            -(word_count / num_words) * log2(word_count / num_words)
            for word_count
            in word_histogram.values()
            )

## EOF ##