# standard imports
from collections import Counter
import math
import typing

# bsie imports
from bsie.extractor import base
from bsie.matcher import nodes
from bsie.utils import bsfs, ns

# exports
__all__: typing.Sequence[str] = (
    'TextMetrics',
    )


## code ##

# Kept for backward compatibility with any external user of this module-level
# name; math.log2 is the idiomatic (and more precise) stdlib equivalent of the
# previous hand-rolled lambda.
log2 = math.log2


class TextMetrics(base.Extractor):
    """Extract text metrics from a document.

    The document content is a sequence of paragraph strings. Supported
    predicates: character, word, and paragraph counts, vocabulary size,
    and vocabulary (word-distribution) entropy.
    """

    CONTENT_READER = 'bsie.reader.document.Document'

    # Maps each supported predicate to the bound method that computes its
    # value from the document content (a sequence of paragraph strings).
    # NOTE: annotation fixed — callbacks take Sequence[str], not str.
    _callmap: typing.Dict[bsfs.schema.Predicate, typing.Callable[[typing.Sequence[str]], typing.Any]]

    def __init__(self):
        super().__init__(bsfs.schema.from_string(base.SCHEMA_PREAMBLE + '''
            bse:num_characters rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:num_paragraphs rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:num_words rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:vocabulary_size rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:integer ;
                bsfs:unique "true"^^xsd:boolean .

            bse:vocabulary_entropy rdfs:subClassOf bsfs:Predicate ;
                rdfs:domain bsn:Entity ;
                rdfs:range xsd:float ;
                bsfs:unique "true"^^xsd:boolean .

            '''))
        self._callmap = {
            self.schema.predicate(ns.bse.num_characters): self.__num_characters,
            self.schema.predicate(ns.bse.num_paragraphs): self.__num_paragraphs,
            self.schema.predicate(ns.bse.num_words): self.__num_words,
            self.schema.predicate(ns.bse.vocabulary_size): self.__vocab_size,
            self.schema.predicate(ns.bse.vocabulary_entropy): self.__entropy,
            }

    def extract(
            self,
            subject: nodes.Entity,
            content: typing.Sequence[str],
            principals: typing.Iterable[bsfs.schema.Predicate],
            ) -> typing.Iterator[typing.Tuple[nodes.Entity, bsfs.schema.Predicate, typing.Any]]:
        """Yield a ``(subject, predicate, value)`` triple for each requested
        predicate in *principals* that this extractor supports; unsupported
        predicates are silently skipped.
        """
        for pred in principals:
            # find callback; skip predicates we don't handle
            clbk = self._callmap.get(pred)
            if clbk is None:
                continue
            # produce triple
            yield subject, pred, clbk(content)

    def __num_words(self, text: typing.Sequence[str]) -> int:
        """Return the total number of whitespace-separated words."""
        return sum(len(paragraph.split()) for paragraph in text)

    def __num_characters(self, text: typing.Sequence[str]) -> int:
        """Return the total number of characters across all paragraphs."""
        return sum(len(paragraph) for paragraph in text)

    def __num_paragraphs(self, text: typing.Sequence[str]) -> int:
        """Return the number of paragraphs."""
        return len(text)

    def __vocab_size(self, text: typing.Sequence[str]) -> int:
        """Return the number of distinct words in the document.

        FIX: the previous implementation summed the *set of distinct
        per-paragraph word counts*, which is neither the vocabulary size
        nor the word count (e.g. two 3-word paragraphs yielded 3).
        """
        return len({word for paragraph in text for word in paragraph.split()})

    def __entropy(self, text: typing.Sequence[str]) -> float:
        """Return the Shannon entropy (in bits) of the word distribution.

        Returns 0.0 for a document with no words (the previous version
        raised ZeroDivisionError on empty input).
        """
        words = [word for paragraph in text for word in paragraph.split()]
        num_words = len(words)
        if num_words == 0:
            return 0.0
        word_histogram = Counter(words)
        # H = -sum(p * log2(p)) over the empirical word probabilities.
        return -sum(
            count / num_words * math.log2(count / num_words)
            for count in word_histogram.values()
            )

## EOF ##