parsing early port

author: Matthias Baumgartner <dev@igsor.net> 2023-01-08 17:21:19 +0100
committer: Matthias Baumgartner <dev@igsor.net> 2023-01-08 17:21:19 +0100
commit: 06904269d867ed7c8355c0615473baf524fcf30b (patch)
tree: e94ecf79bd1c7f2738d0f031675303ad1ee20e9e /tagit
parent: 7cf03c4de5244e7db3f44362a275f92368fd86ac (diff)
download: tagit-06904269d867ed7c8355c0615473baf524fcf30b.tar.gz
tagit-06904269d867ed7c8355c0615473baf524fcf30b.tar.bz2
tagit-06904269d867ed7c8355c0615473baf524fcf30b.zip
5 files changed, 1196 insertions, 8 deletions
diff --git a/tagit/parsing/__init__.py b/tagit/parsing/__init__.py
new file mode 100644
index 0000000..1c431a4
--- /dev/null
+++ b/tagit/parsing/__init__.py
@@ -0,0 +1,19 @@
+"""
+
+Part of the tagit module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# inner-module imports
+from .datefmt import parse_datetime
+from .search import ast_from_string
+from .sort import sort_from_string
+
+# exports
+__all__ = (
+    'ast_from_string',
+    'parse_datetime',
+    'sort_from_string',
+    )
+
+## EOF ##
diff --git a/tagit/parsing/datefmt.py b/tagit/parsing/datefmt.py
new file mode 100644
index 0000000..49de1c0
--- /dev/null
+++ b/tagit/parsing/datefmt.py
@@ -0,0 +1,568 @@
+"""Parse and interpret date strings.
+
+Consider the following date notations (DMY=04.11.2012):
+
+DMY     04.11.12        europe
+YMD     12.11.04        iso
+MDY     11.04.12        US
+YDM     12.04.11        reverse US
+DYM     04.12.11        too uncommon, ignored
+MYD     11.12.04        too uncommon, ignored
+
+There's the general problem of ambiguity between the DMY and MDY formats.
+Here, we give precedence to the DMY format.
+
+Note that the MDY format can still be used in unambiguous settings or
+with the month spelled out, e.g. "2012, 23th of November"
+
+Similarly, consider the following shortened date notations:
+
+DM      04.11           europe, current year
+MY      11.12           quarters
+YM      12.11           quarters
+MD      11.04           us, current year
+DY      23.12           too uncommon, ignored
+YD      12.23           too uncommon, ignored
+
+In addition to the different spellings, month names can be spelled out
+and the string can be cluttered with additional common words.
+
+Part of the tagit module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+from collections import Counter
+from datetime import date as ddate, time as dtime, datetime, timedelta
+from math import floor
+
+# external imports
+from dateutil.relativedelta import relativedelta
+from pyparsing import Combine, Group, Literal, Optional, Or, Word, nums, oneOf, ParseException
+
+# tagit imports
+from tagit.utils import errors, Struct, flatten
+
+# exports
+__all__ = (
+        # default format strings
+        'DATE_FMT',
+        'TIME_FMT',
+        'DATETIME_FMT',
+        # exceptions
+        'DateParserError',
+        'TimeParserError',
+        'DateFormatError'
+        # parsing
+        'parse_datetime',
+        'guess_datetime',
+        # postprocessing
+        'increment',
+        )
+
+## constants ##
+"""Default strftime format strings."""
+DATE_FMT = '%d.%m.%Y'
+TIME_FMT = '%H:%M'
+DATETIME_FMT = '%d.%m.%Y, %H:%M'
+
+# Literal months
+MONTH_LIT = {
+    'Jan'       : 1,
+    'January'   : 1,
+    'Feb'       : 2,
+    'February'  : 2,
+    'Mar'       : 3,
+    'March'     : 3,
+    'Apr'       : 4,
+    'April'     : 4,
+    'May'       : 5,
+    'Jun'       : 6,
+    'June'      : 6,
+    'Jul'       : 7,
+    'July'      : 7,
+    'Aug'       : 8,
+    'August'    : 8,
+    'Sep'       : 9,
+    'September' : 9,
+    'Oct'       : 10,
+    'October'   : 10,
+    'Nov'       : 11,
+    'November'  : 11,
+    'Dec'       : 12,
+    'December'  : 12,
+    }
+
+
+## code ##
+
+class DatefmtError(errors.ParserError): pass
+
+class DateParserError(DatefmtError): pass
+
+class TimeParserError(DatefmtError): pass
+
+class DateFormatError(DatefmtError): pass
+
+class DF(str):
+    """date/time user-supplied format."""
+    # indicator characters, highest to lowest.
+    _valid_chars = "YMDhmsn"
+    # explicit mapping from unit to character
+    year                = 'Y'
+    month               = 'M'
+    day                 = 'D'
+    hour                = 'h'
+    minute              = 'm'
+    second              = 's'
+    microsecond         = 'n'
+
+    def valid(self):
+        return len(self) and len(set(self._valid_chars) & set(self))
+
+    def lsb(self):
+        """Smallest unit specified."""
+        if not self.valid():
+            raise DateFormatError(
+                'An empty date format string has no least significant position.', self)
+
+        return [i for i in self._valid_chars if i in self][-1]
+
+    def msb(self):
+        """Highest unit specified."""
+        if not self.valid():
+            raise DateFormatError(
+                'An empty date format string has no most significant position.', self)
+
+        return [i for i in self._valid_chars if i in self][0]
+
+    def is_time(self):
+        """Return true if only a time (hour/minute/second/ms) was specified."""
+        return True if self.valid() and self.msb() not in 'YMD' else False
+
+    def is_date(self):
+        """Return true if only a date (year/month/day) was specified."""
+        return True if self.valid() and self.lsb() not in 'hmsn' else False
+
+# priorities
+PRIORITIES_INT = {
+    'p2': [
+        DF(DF.day + DF.month),  # DM
+        DF(DF.month + DF.year), # MY
+        DF(DF.year + DF.month), # YM
+        DF(DF.month + DF.day),  # MD
+        DF(DF.day + DF.year),   # DY
+        DF(DF.year + DF.day),   # YD
+        ],
+    'p3': [
+        DF(DF.day + DF.month + DF.year), # DMY
+        DF(DF.year + DF.month + DF.day), # YMD
+        DF(DF.month + DF.day + DF.year), # MDY
+        DF(DF.year + DF.day + DF.month), # YDM
+        DF(DF.day + DF.year + DF.month), # DYM
+        DF(DF.month + DF.year + DF.day), # MYD
+        ]
+    }
+
+PRIORITIES_US = {
+    'p2': [
+        DF(DF.month + DF.day),
+        DF(DF.year + DF.month),
+        DF(DF.day + DF.month),
+        DF(DF.month + DF.year),
+        DF(DF.day + DF.year),
+        DF(DF.year + DF.day),
+        ],
+    'p3': [
+        DF(DF.month + DF.day + DF.year),
+        DF(DF.year + DF.day + DF.month),
+        DF(DF.day + DF.month + DF.year),
+        DF(DF.year + DF.month + DF.day),
+        DF(DF.day + DF.year + DF.month),
+        DF(DF.month + DF.year + DF.day),
+        ]
+    }
+
+def guess_date(tokens, priorities=None):
+    """Guess the date from a string in an unknown format.
+
+    The method uses the following clues to guess the meaning of each part:
+    * 4-digits implies it's a year
+    * 1-digit discards year (since it's more common to write 04 instead of 4 as a shorthand to 2004
+    * Literal month
+    * 'of' is preceeded by day and succeeded by the month
+    * Any of (st, nd, rd, th) on a number makes it a day
+    * Number > 12 can't be a month
+    * Number > 31 can't be a day
+    * Date inexistence (e.g. 29.02.2018)
+    * precedence DMY > YMD > MDY > YDM
+    * precedence DM > MY > YM > MD
+    """
+    priorities = PRIORITIES_INT if priorities is None else priorities
+
+    # We need to figure out which token corresponds to which component
+    # (D, M, Y). Since this is ambiguous, guesswork is needed. We do so
+    # by eliminating impossible options.
+
+    # initially, all three components are viable
+    guesses = [Struct(tok=tok.strip(), fmt=DF.year + DF.month + DF.day) for tok in tokens]
+
+    # check indicators for specific formats
+    for idx in range(len(guesses)):
+        tok, options = guesses[idx].tok, guesses[idx].fmt
+
+        if len(tok) == 1 and tok in '.,;':
+            # delimiter tokens can be ignored
+            guesses[idx].fmt = ''
+        elif tok == 'of':
+            # an 'of' token indicates a 'day of month' structure
+            guesses[idx-1].fmt = DF.day
+            guesses[idx+1].fmt = DF.month
+            guesses[idx].fmt = ''
+        elif tok[-2:] in ('st', 'nd', 'rd', 'th'):
+            # suffix indicates a day
+            guesses[idx].fmt = DF.day
+            guesses[idx].tok = tok[:-2]
+        elif len(tok) == 4 and tok.isdigit():
+            # four digits must be a year
+            guesses[idx].fmt = DF.year
+        elif tok in MONTH_LIT:
+            # spelled out month is - tadaaa - a month
+            guesses[idx].tok = str(MONTH_LIT[tok])
+            guesses[idx].fmt = DF.month
+
+    # remove jitter (of, delimiters)
+    guesses = [itm for itm in guesses if len(itm.fmt) > 0]
+
+    # eliminate impossible options
+    for idx in range(len(guesses)):
+        tok, options = guesses[idx].tok, guesses[idx].fmt
+
+        if len(tok) == 1:
+            # can't be a year
+            guesses[idx].fmt = guesses[idx].fmt.replace(DF.year, '')
+        if tok.isdigit() and int(tok) > 12:
+            # can't be a month
+            guesses[idx].fmt = guesses[idx].fmt.replace(DF.month, '')
+        if tok.isdigit() and int(tok) > 31:
+            # can't be a day
+            guesses[idx].fmt = guesses[idx].fmt.replace(DF.day, '')
+
+    # define helper function
+    def create_date(year, month, day):
+        """Return a datetime for the given components or None if that is not possible."""
+        # check format
+        if DF.year not in year.fmt or DF.month not in month.fmt or DF.day not in day.fmt:
+            return None
+
+        if len(str(year.tok)) == 2:
+            # ten years into the future is still the current century, otherwise the previous one
+            threshold = ddate.today().year + 10 - 2000
+            year = Struct(
+                    tok='20'+str(year.tok) if int(year.tok) < threshold else '19'+str(year.tok),
+                    fmt=year.fmt
+                    )
+
+        try:
+            # create date
+            return ddate(year=int(year.tok), month=int(month.tok), day=int(day.tok))
+        except ValueError:
+            return None
+
+    # placeholders for unspecified tokens
+    pyear = Struct(tok=ddate.today().year, fmt=DF.year)
+    pday = Struct(tok=1, fmt=DF.day)
+    pmon = Struct(tok=1, fmt=DF.month)
+
+    if len(guesses) == 1: # one-part date (Y)
+        itm = guesses[0]
+        date = create_date(itm, pmon, pday)
+        if date is not None:
+            return date, DF(DF.year)
+        else:
+            raise DateParserError('Two-digit date format must contain the year')
+
+    elif len(guesses) == 2: # two-part date (DM, MY, YM, MD)
+        fst, snd = guesses
+        # check components
+        if len(set(fst.fmt + snd.fmt)) < 2:
+            raise DateParserError('Invalid two-digit date format')
+
+        if len(fst.fmt) == 1 and len(snd.fmt) == 1: # fully determined
+            date = {
+                DF.year: pyear,
+                DF.month: pmon,
+                DF.day: pday,
+                }
+            date.update({
+                fst.fmt: fst,
+                snd.fmt: snd,
+                })
+            return create_date(date[DF.year], date[DF.month], date[DF.day]), DF(fst.fmt + snd.fmt)
+
+        # walk through prioritized formats
+        formats = {
+            DF(DF.day + DF.month):  create_date(pyear, snd, fst), # DM
+            DF(DF.month + DF.year): create_date(snd, fst, pday),  # MY
+            DF(DF.year + DF.month): create_date(fst, snd, pday),  # YM
+            DF(DF.month + DF.day):  create_date(pyear, fst, snd), # MD
+            DF(DF.day + DF.year):   create_date(snd, pmon, fst),  # DY
+            DF(DF.year + DF.day):   create_date(fst, pmon, snd),  # YD
+        }
+
+        for fmt in priorities['p2']:
+            if formats.get(fmt, None) is not None:
+                return formats[fmt], fmt
+
+        raise DateParserError('Cannot guess roles of a two-digit date format')
+
+    elif len(guesses) == 3: # three-part date (DMY, YMD, MDY, YMD)
+
+        # eliminate options based on uniqueness of component assignment
+        changed = True
+        while changed:
+            # resolved guesses: item has only one possible component option
+            resolved = set([itm.fmt for itm in guesses if len(itm.fmt) == 1])
+            # single choice: component has only one possible position
+            unique = {comp for comp, freq in
+                      Counter(flatten([set(itm.fmt) for itm in guesses])).items()
+                      if freq == 1}
+            # assume no changes
+            changed = False
+            for itm in guesses:
+                if unique & set(itm.fmt) and not set(itm.fmt).issubset(unique):
+                    # itm is the only option for one component
+                    itm.fmt = DF(''.join(unique & set(itm.fmt)))
+                    changed = True
+                elif resolved & set(itm.fmt) and not set(itm.fmt).issubset(resolved):
+                    # itm contains options that already taken by a different item
+                    itm.fmt = itm.fmt.translate(str.maketrans('', '', ''.join(resolved)))
+                    changed = True
+
+        fst, snd, trd = guesses
+
+        # check components
+        if len(set(fst.fmt + snd.fmt + trd.fmt)) < 3:
+            raise DateParserError('Invalid three-digit date format')
+
+        if len(fst.fmt) == 1 and len(snd.fmt) == 1 and len(trd.fmt) == 1: # fully determined
+            date = {
+                fst.fmt: fst,
+                snd.fmt: snd,
+                trd.fmt: trd,
+                }
+            return (create_date(date[DF.year], date[DF.month], date[DF.day]),
+                    DF(fst.fmt + snd.fmt + trd.fmt))
+
+        # walk through prioritized formats
+        formats = {
+            DF(DF.day + DF.month + DF.year): create_date(year=trd, month=snd, day=fst), # DMY
+            DF(DF.year + DF.month + DF.day): create_date(year=fst, month=snd, day=trd), # YMD
+            DF(DF.month + DF.day + DF.year): create_date(year=trd, month=fst, day=snd), # MDY
+            DF(DF.year + DF.day + DF.month): create_date(year=fst, month=trd, day=snd), # YDM
+            DF(DF.day + DF.year + DF.month): create_date(year=snd, month=trd, day=fst), # DYM
+            DF(DF.month + DF.year + DF.day): create_date(year=snd, month=fst, day=trd), # MYD
+        }
+
+        for fmt in priorities['p3']:
+            if formats.get(fmt, None) is not None:
+                return formats[fmt], fmt
+
+        raise DateParserError('Cannot guess the roles of a three-digit date format')
+
+    raise DateParserError('Cannot parse the date format')
+
+def guess_time(tokens):
+    """Guess the time from a string in an unknown format.
+
+    * Always sorted from hi (hour) to low (sec)
+    * 4 Terms -> hour, min, sec, ns
+    * 3 Terms -> hour, min, sec
+    * 2 Terms -> hour, min | min, sec
+      * both terms > 24 -> min, sec
+      * am or pm present -> hour, min
+      * Dot separation -> min, sec
+      * Colon separation -> hour, min
+    """
+    # remove spearators
+    tokens = [tok.lower() for tok in tokens if tok not in '.,:']
+    # check if the am/pm format was used
+    is_am = 'am' in tokens
+    is_pm = 'pm' in tokens
+
+    # remove non-numbers
+    tokens = [tok for tok in tokens if tok.isdigit()]
+    if not len(tokens):
+        raise TimeParserError()
+
+    # convert to int
+    ms     = int(tokens[-1].ljust(6, '0'))
+    tokens = [int(tok) for tok in tokens]
+
+    # guess format
+    try:
+        if len(tokens) == 4: # H:M:S.NS
+            tokens[-1] = ms
+            return dtime(*tokens), DF(DF.hour + DF.minute + DF.second + DF.microsecond)
+        elif len(tokens) == 3: # H:M:S
+            return dtime(*tokens), DF(DF.hour + DF.minute + DF.second)
+        elif len(tokens) == 2: # H:M or M:S
+            if is_am: # am/pm notation was used
+                return dtime(*tokens), DF(DF.hour + DF.minute)
+            elif is_pm: # am/pm notation was used
+                return dtime(tokens[0] + 12, tokens[1]), DF(DF.hour + DF.minute)
+            elif tokens[0] > 24: # min, sec
+                return dtime(0, *tokens), DF(DF.minute + DF.second)
+            else: # hour, sec
+                return dtime(*tokens), DF(DF.hour + DF.minute)
+        elif len(tokens) == 1: # H
+            if is_am: # am/pm notation was used
+                return dtime(tokens[0]), DF(DF.hour)
+            elif is_pm: # am/pm notation was used
+                return dtime(tokens[0] + 12), DF(DF.hour)
+            else:
+                return dtime(tokens[0], 0), DF(DF.hour)
+
+    except ValueError:
+        # invalid value was supplied, e.g. hour=85
+        raise TimeParserError('Invalid value', tokens)
+
+    raise TimeParserError('Unknown time format', tokens)
+
+def guess_datetime(exp):
+    """Return a datetime instance by guessing the components of a DATETIME parsed
+    user-supplied date and/or time string. Guessing might be necessary since dates
+    like 10.11.12 are ambiguous. *exp* is supposed to be a pyparsing.ParseResults
+    instance as returned by DATETIME.parseString(...).
+    """
+    # For now I assumed unique separators (dot for date, colon for time, comma to separate the two)
+    if 'date' in exp and 'time' in exp: # both parts present
+        date, dfmt = guess_date(exp.date)
+        time, tfmt = guess_time(exp.time)
+        return datetime.combine(date, time), DF(dfmt+tfmt)
+    elif 'date' in exp: # date present
+        date, dfmt = guess_date(exp.date)
+        return datetime(date.year, date.month, date.day), dfmt
+    elif 'time' in exp: # time present
+        time, tfmt = guess_time(exp.time)
+        return datetime.combine(ddate.fromtimestamp(0), time), tfmt
+    else:
+        raise DateFormatError('Neither a date nor a time was found.')
+
+def increment(date, fmt):
+    """Increment the LSB of a datetime instance by one."""
+    if fmt == '' or not fmt.valid():
+        raise DateFormatError('Invalid date format string', fmt)
+    elif fmt.lsb() == fmt.microsecond: # 5.11.2012, 06:24:18.25 -> 5.11.2012, 06:25:18.26
+        return date + relativedelta(microseconds=1)
+    elif fmt.lsb() == fmt.second: # 5.11.2012, 06:24:18 -> 5.11.2012, 06:25:19
+        return date + relativedelta(seconds=1, microsecond=0)
+    elif fmt.lsb() == fmt.minute: # 5.11.2012, 06:24 -> 5.11.2012, 06:25
+        return date + relativedelta(minutes=1, second=0, microsecond=0)
+    elif fmt.lsb() == fmt.hour: # 5.11.2012, 06am -> 5.11.2012, 07:00
+        return date + relativedelta(hours=1, minute=0, second=0, microsecond=0)
+    elif fmt.lsb() == fmt.day: # 5.11.2012 -> 6.11.2012, 00:00
+        return date + relativedelta(days=1, hour=0, minute=0, second=0, microsecond=0)
+    elif fmt.lsb() == fmt.month: # 11.2012 -> 1.12.2012
+        return date + relativedelta(months=1, day=1, hour=0, minute=0, second=0, microsecond=0)
+    else: # fmt.lsb() == fmt.year: # 2012 -> 1.1.2013, 00:00
+        return date + relativedelta(
+                years=1, month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
+
+class DateTimeParser():
+
+    DATE = None
+    TIME = None
+    DATETIME = None
+
+    def build_parser(self):
+        """
+        DATE        := YMD | DMY | MDY | YDM
+        YMD         := YEAR SEP MON SEP DAY
+        DMY         := DAY SEP [of] MON SEP YEAR
+        MDY         := MON SEP DAY SEP YEAR
+        YDM         := YEAR SEP DAY [of] MON
+        DM          := DAY SEP [of] MON
+        YM          := YEAR SEP MON
+        MY          := MON SEP YEAR
+        MD          := MON SEP DAY
+        DAY         := [D]D | [D]D st | [D]D nd | [D]D rd | [D]D th
+        MON         := [M]M | [month]
+        YEAR        := [YY]YY
+        SEP         := . | , | [whitespace]
+        {D,M,Y}     := [digit]
+        """
+        # FIXME: Allow more patterns (e.g. 2012, 10; April, 5th; April, 2020)
+        sep = Literal('.') # FIXME: Allow '. - :'
+        year = Word(nums, exact=2) ^ Word(nums, exact=4)
+        month = Word(nums, min=1, max=2) ^ oneOf(list(MONTH_LIT.keys()))
+        day = Combine(Word(nums, min=1, max=2) + Optional(oneOf('st nd rd th')))
+        # three-part-date
+        YMD     = year + sep + month + sep + day
+        DMY     = day + (sep ^ 'of') + month + sep + year
+        MDY     = month + sep + day + sep + year
+        YDM     = year + sep + day + (sep ^ 'of') + month
+        # two-part-date
+        DM      = day + (sep ^ 'of')+ month
+        YM      = year + sep + month
+        MY      = month + sep + year
+        MD      = month + sep + day
+        Y       = Word(nums, exact=4)
+        # date parser
+        self.DATE = Group(YMD | DMY | YDM | MDY | DM | YM | MY | MD | Y).setResultsName('date')
+
+        """
+        TIME        := HOUR SEP MIN [SEP SEC [. MS]] | HOUR SEP MIN | HOUR [SEP MIN] {am|pm}
+        HOUR        := [H]H
+        MIN         := [M]M
+        SEC         := [S]S
+        {H,M,S}     := [digit]
+        SEP         := : | . | ,
+        """
+        sep = Literal(':') # FIXME: Allow '. : -'
+        HMS = Word(nums, min=1, max=2)
+        MS = Word(nums, min=1)
+        # time parser
+        self.TIME = Group(HMS + sep + HMS + sep + HMS + oneOf('. :') + MS \
+                | HMS + sep + HMS + sep + HMS \
+                | HMS + Optional(sep + HMS) + oneOf('am pm') \
+                | HMS + sep + HMS ).setResultsName('time')
+
+        """
+        DATETIME    := DATE | TIME | DATE SEP TIME | TIME SEP DATE
+        SEP         := , [whitespace]
+        """
+        self.DATETIME = Group(
+                        self.DATE \
+                        ^ self.TIME \
+                        ^ self.DATE + Optional(',') + self.TIME \
+                        ^ self.TIME + Optional(',') + self.DATE \
+                        ).setResultsName('datetime')
+        return self
+
+    def __call__(self, datestr):
+        if self.DATETIME is None:
+            self.build_parser()
+
+        try:
+            date, fmt = guess_datetime(self.DATETIME.parseString(datestr, parseAll=True)[0])
+            return date
+
+        except ParseException as e:
+            raise errors.ParserError('Cannot parse query', e)
+
+
+"""Default DateTimeParser instance.
+
+To produce an datetime, call
+
+>>> parse_datetime(datestring)
+
+Convenience shortcut for
+
+>>> DateTimeParser().parse(datestring)
+
+"""
+parse_datetime = DateTimeParser().build_parser()
+
+## EOF ##
diff --git a/tagit/parsing/search.py b/tagit/parsing/search.py
new file mode 100644
index 0000000..10d0e7c
--- /dev/null
+++ b/tagit/parsing/search.py
@@ -0,0 +1,405 @@
+"""User-specified search query parsing.
+
+>>> q = "has mime / tag in (november, october) / ! Apfel / time < 10.10.2004 / iso in (100, 200)"
+>>> ast = ast_from_string(q)
+
+Part of the tagit module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# standard imports
+from datetime import datetime
+
+# external imports
+from pyparsing import CaselessKeyword, Combine, Group, Optional, Or, Word, delimitedList, nums, oneOf, ParseException, Literal, QuotedString, alphanums, alphas8bit, punc8bit
+
+# tagit imports
+from tagit.utils import errors, ttime
+
+# inner-module imports
+from . import datefmt
+
+# exports
+__all__ = (
+    'ast_from_string',
+    )
+
+# constants
+SEARCH_DELIM = '/'
+VALUE_DELIM = ','
+DEFAULT_PREDICATE = 'tag'
+
+
+## code ##
+
+class SearchParser():
+
+    # valid predicates per type
+    _PREDICATES_CATEGORICAL = None
+    _PREDICATES_CONTINUOUS  = None
+    _PREDICATES_DATETIME    = None
+
+    # parsers
+    _CATEGORICAL = None
+    _CONTINUOUS = None
+    _EXISTENCE = None
+    _QUERY = None
+    _TAG = None
+
+    def __init__(self, schema: bsfs.schema.Schema):
+        self.schema = schema
+
+    @property
+    def schema(self) -> bsfs.schema.Schema:
+        return self._schema
+
+    @schema.setter
+    def schema(self, schema: bsfs.schema.Schema):
+        self._schema = schema
+        self.build_parser()
+
+    def build_parser(self):
+        """
+        """
+        # The *predicate* argument is for compatibility with predicate listener.
+        # It's not actually used here.
+
+        # valid predicates per type, as supplied by tagit.library
+        # FIXME:
+        # * range / type constraints
+        # * how to filter predicates
+        # * distinguish between prefix and label
+        """
+        Categorical:    string, float, integer; labelled node (tag, group); maybe boolean
+        Continuous:     float, integer
+        Datetime:       datetime
+        Existencial:    all of the above, particularly bool; unllabelled node (preview)
+        -> rfds:range
+
+        > Target:  Entity (allow others?) -> rfds:domain
+        > Require: searchable as specified in backend AND user-searchable as specified in frontend
+        """
+        self._PREDICATES_CATEGORICAL = self.schema.predicates(searchable=True, range=self.schema.tm.categorical) # FIXME!
+        self._PREDICATES_CONTINUOUS  = self.schema.predicates(searchable=True, range=self.schema.tm.numerical)  # FIXME!
+        self._PREDICATES_DATETIME    = self.schema.predicates(searchable=True, range=self.schema.tm.datetime) # FIXME!
+
+        # terminal symbols
+        number = Group(Optional(oneOf('- +')) \
+                    + Combine(Word(nums) + Optional('.' + Optional(Word(nums)))))
+        words = QuotedString(quoteChar='"', escChar='\\') \
+                ^ QuotedString(quoteChar="'", escChar='\\') \
+                ^ Word(alphanums + alphas8bit + punc8bit + ' *#%&-.:;<=>?@^_`{}~')
+        # FIXME: allow escaped chars "( ) / , [ ]"
+        # FIXME: Non-ascii characters
+
+        # predicates
+        predicate = Or([CaselessKeyword(p) for p in self._PREDICATES_CATEGORICAL]).setResultsName(
+            'predicate')
+        date_predicate = Or([CaselessKeyword(p) for p in self._PREDICATES_DATETIME]).setResultsName(
+            'predicate')
+        num_predicate = Or([CaselessKeyword(p) for p in self._PREDICATES_CONTINUOUS]).setResultsName(
+            'predicate')
+
+        # existence
+        """
+        EXPR        := has PREDICATE | has no PREDICATE
+        PREDICATE   := [predicate]
+        """
+        op = (CaselessKeyword('has') ^ CaselessKeyword('has no') ^ CaselessKeyword('has not')).setResultsName('op')
+        self._EXISTENCE = Group(op + predicate).setResultsName('existence')
+
+
+        # continuous
+        """
+        EXPR        := PREDICATE CMP VALUE | VALUE CMP PREDICATE CMP VALUE | PREDICATE OP RANGE
+        PREDICATE   := [predicate]
+        CMP         := < | <= | = | >= | >
+        OP          := : | = | in | not in
+        RANGE       := BOPEN VALUE RSEP VALUE BCLOSE | BOPEN RSEP VALUE BLOSE | BOPEN VALUE RSEP BCLOSE
+        BOPEN       := ( | [ | ]
+        BCLOSE      := ) | ] | [
+        RSEP        := : | -
+        VALUE       := [digit] | [date]
+        """
+        # range style
+        rsepn = oneOf(': -')
+        rsepd = Literal('-')
+        bclose = oneOf(') ] [').setResultsName('bclose')
+        bopen = oneOf('( [ ]').setResultsName('bopen')
+        op = Or([':', '=', 'in']).setResultsName('op')
+        datefmt = datefmt.parse_datetime.DATETIME
+        rngn = num_predicate + op + bopen + number('lo') + rsepn + number('hi') + bclose ^ \
+               num_predicate + op + bopen + rsepn + number('hi') + bclose ^ \
+               num_predicate + op + bopen + number('lo') + rsepn + bclose
+        rngd = date_predicate + op + bopen + datefmt('lo') + rsepd + datefmt('hi') + bclose ^ \
+               date_predicate + op + bopen + rsepd + datefmt('hi') + bclose ^ \
+               date_predicate + op + bopen + datefmt('lo') + rsepd + bclose
+        # equation style
+        cmp = oneOf('< <= = >= >').setResultsName('cmp')
+        eqn = num_predicate + cmp('cright') + number('vright') ^ \
+              number('vleft') + cmp('cleft') + num_predicate ^ \
+              number('vleft') + cmp('cleft') + num_predicate + cmp('cright') + number('vright')
+        eqd = date_predicate + cmp('cright') + datefmt('vright') ^ \
+              datefmt('vleft') + cmp('cleft') + date_predicate ^ \
+              datefmt('vleft') + cmp('cleft') + date_predicate + cmp('cright') + datefmt('vright')
+        # combined
+        self._CONTINUOUS = Group(
+            Group(eqn).setResultsName('eq') ^
+            Group(eqd).setResultsName('eq') ^
+            Group(rngn).setResultsName('range') ^ \
+            Group(rngd).setResultsName('range') \
+            ).setResultsName('continuous')
+
+
+        # categorical
+        """
+        EXPR        := PREDICATE OP VALUE | PREDICATE OP (VALUE)
+        PREDICATE   := [predicate]
+        OP          := : | = | in | not in | != | ~ | !~
+        VALUE       := TERM | VALUE,
+        TERM        := [word]
+        """
+        op = (CaselessKeyword('in') ^ CaselessKeyword('not in') ^ ':' ^ '=' ^ '!=' ^ '~' ^ '!~').setResultsName('op')
+        value = delimitedList(words, delim=VALUE_DELIM).setResultsName('value')
+        self._CATEGORICAL = Group(predicate + op + ('(' + value + ')' | value) ).setResultsName('categorical')
+
+
+        # tag shortcuts
+        """
+        EXPR        := OP VALUE | OP (VALUE) | VALUE | (VALUE)
+        OP          := ! | ~ | !~
+        VALUE       := TERM | VALUE,
+        TERM        := [word]
+        """
+        op = oneOf('! ~ !~').setResultsName('op')
+        value = delimitedList(words, delim=VALUE_DELIM).setResultsName('value')
+        self._TAG = Group(Optional(op) + '(' + value + ')' ^ Optional(op) + value).setResultsName('tag')
+
+
+        # overall query
+        """
+        QUERY       := QUERY / QUERY | EXPR
+        """
+        self._QUERY = delimitedList(self._EXISTENCE | self._CONTINUOUS | self._CATEGORICAL | self._TAG, delim=SEARCH_DELIM)
+        return self
+
+    def __del__(self):
+        if self._QUERY is not None: # remove listener
+            try:
+                self.predicates.ignore(self.build_parser)
+            except ImportError:
+                # The import fails if python is shutting down.
+                # In that case, the ignore becomes unnecessary anyway.
+                pass
+
+    def __call__(self, search):
+        # FIXME: mb/port/parsing
+        #if self._QUERY is None:
+        #    # parsers were not initialized yet
+        #    self.build_parser()
+        #    # attach listener to receive future updates
+        #    self.predicates.listen(self.build_parser)
+        #    # FIXME: Additional filters would be handy
+        #    #self.predicates.listen(self.build_parser, self.predicates.scope.library)
+
+        try:
+            parsed = self._QUERY.parseString(search, parseAll=True)
+        except ParseException as e:
+            raise errors.ParserError('Cannot parse query', e)
+
+        # convert to AST
+        tokens = []
+        for exp in parsed:
+            if exp.getName() == 'existence':
+                if 'op' not in exp: # prevented by grammar
+                    raise errors.ParserError('Missing operator', exp)
+                elif exp.op == 'has':
+                    cond = ast.Existence()
+                elif exp.op in ('has no', 'has not'):
+                    cond = ast.Inexistence()
+                else: # prevented by grammar
+                    raise errors.ParserError('Invalid operator ({})'.format(exp.op), exp)
+
+                tokens.append(
+                    ast.Token(exp.predicate.lower(), cond))
+
+            elif exp.getName() == 'categorical':
+                values = [s.strip() for s in exp.value]
+                if 'op' not in exp: # prevented by grammar
+                    raise errors.ParserError('Missing operator', exp)
+                elif exp.op in (':', '=', 'in'):
+                    cond = ast.SetInclude(values)
+                elif exp.op in ('!=', 'not in'):
+                    cond = ast.SetExclude(values)
+                elif exp.op == '~':
+                    cond = ast.SetInclude(values, approximate=True)
+                elif exp.op == '!~':
+                    cond = ast.SetExclude(values, approximate=True)
+                else: # prevented by grammar
+                    raise errors.ParserError('Invalid operator ({})'.format(exp.op), exp)
+
+                tokens.append(
+                    ast.Token(exp.predicate.lower(), cond))
+
+            elif exp.getName() == 'tag':
+                values = [s.strip() for s in exp.value]
+                if 'op' not in exp:
+                    cond = ast.SetInclude(values)
+                elif exp.op == '~':
+                    cond = ast.SetInclude(values, approximate=True)
+                elif exp.op == '!':
+                    cond = ast.SetExclude(values)
+                elif exp.op == '!~':
+                    cond = ast.SetExclude(values, approximate=True)
+                else: # prevented by grammar
+                    raise errors.ParserError('Invalid operator ({})'.format(exp.op), exp)
+
+                tokens.append(
+                    ast.Token(DEFAULT_PREDICATE, cond))
+
+            elif exp.getName() == 'continuous':
+
+                lo, hi = None, None
+                lo_inc, hi_inc = False, False
+                predicate = None
+
+                if 'eq' in exp:
+                    # equation style
+                    predicate = exp.eq.predicate.lower()
+
+                    if ('>' in exp.eq.cleft and '<' in exp.eq.cright) or \
+                       ('<' in exp.eq.cleft and '>' in exp.eq.cright) or \
+                       (exp.eq.cleft == '=' and exp.eq.cright == '='):
+                        # x > pred < y or x < pred > y or x = pred = y
+                        raise errors.ParserError('Cannot have two lower or two upper bounds', exp)
+
+                    if '>' in exp.eq.cleft:
+                        hi = exp.eq.vleft
+                        hi_inc = '=' in exp.eq.cleft
+                    elif '<' in exp.eq.cleft:
+                        lo = exp.eq.vleft
+                        lo_inc = '=' in exp.eq.cleft
+                    elif exp.eq.cleft == '=':
+                        hi = lo = exp.eq.vleft
+                        lo_inc = hi_inc = True
+
+                    if '>' in exp.eq.cright:
+                        lo = exp.eq.vright
+                        lo_inc = '=' in exp.eq.cright
+                    elif '<' in exp.eq.cright:
+                        hi = exp.eq.vright
+                        hi_inc = '=' in exp.eq.cright
+                    elif exp.eq.cright == '=':
+                        hi = lo = exp.eq.vright
+                        lo_inc = hi_inc = True
+
+                elif 'range' in exp: # value in [lo:hi]
+                    predicate = exp.range.predicate.lower()
+
+                    if 'lo' in exp.range:
+                        lo = exp.range.lo
+                        lo_inc = exp.range.bopen == '['
+                    if 'hi' in exp.range:
+                        hi = exp.range.hi
+                        hi_inc = exp.range.bclose == ']'
+
+                else: # prevented by grammar
+                    raise errors.ParserError('Expression is neither a range nor an equation', exp)
+
+                # interpret values
+                if predicate in set([p.lower() for p in self._PREDICATES_DATETIME]):
+
+                    # turn into datetime
+                    lo, lfmt = datefmt.guess_datetime(lo) if lo is not None else (None, None)
+                    hi, hfmt = datefmt.guess_datetime(hi) if hi is not None else (None, None)
+
+                    if lo is None and hi is None: # prevented by grammar
+                        raise errors.ParserError('At least one bound must be present', exp)
+
+                    # turn the query into the format lo <= pred < hi by adjusting the boundaries
+                    if hi == lo and lo_inc and hi_inc:
+                        # example: pred = 2012 -> 1.1.2012 <= pred < 1.1.2013
+                        hi = datefmt.increment(lo, lfmt)
+                        lo_inc = True
+                        hi_inc = False
+                    else:
+                        if lo is not None:
+                            # example: pred >= 2012 -> pred >= 1.1.2012, 00:00
+                            lo = datefmt.increment(lo, lfmt) if not lo_inc else lo
+                            lo_inc = True
+
+                        if hi is not None:
+                            # example: pred <= 2012 -> pred  <  1.1.2013, 00:00
+                            hi = datefmt.increment(hi, hfmt) if hi_inc else hi
+                            hi_inc = False
+
+                    # build the ast node
+                    if (lo is not None and lfmt.is_time()) or (hi is not None and hfmt.is_time()):
+                        # time specification
+
+                        if (lo is not None and not lfmt.is_time()) or \
+                            (hi is not None and not hfmt.is_time()):
+                            # lo/hi must both be time specifications
+                            raise errors.ParserError('Both bounds must be a time specification', (lo, hi))
+
+                        if lo is None:
+                            # example: pred < 5 am -> 0 <= pred < 05:00
+                            lo = ttime.from_timestamp_loc(0)
+                            lo_inc = True
+
+                        if hi is None:
+                            # example: pred > 5 am -> 06:00 <= pred <= 24:00
+                            hi = ttime.from_timestamp_loc(3600 * 24)
+                            hi_inc = True
+
+                        # Check consistency
+                        if not (lo < hi or (lo == hi and lo_inc and hi_inc)):
+                            raise errors.ParserError('Lower bound must not exceed upper bound', (lo, hi))
+
+                        tokens.append(
+                            ast.Token(predicate, ast.TimeRange(lo, hi, lo_inc, hi_inc)))
+
+                    else: # date specification
+                        # Check consistency
+                        lo = lo if lo is not None else datetime.min
+                        hi = hi if hi is not None else datetime.max
+
+                        if not (lo < hi or (lo == hi and lo_inc and hi_inc)):
+                            raise errors.ParserError('Lower bound must not exceed upper bound', (lo, hi))
+
+                        tokens.append(
+                            ast.Token(predicate, ast.Datetime(lo, hi, lo_inc, hi_inc)))
+
+                else:
+                    # number predicate
+                    lo = float(''.join(lo)) if lo is not None else float('-inf')
+                    hi = float(''.join(hi)) if hi is not None else float('inf')
+
+                    # Check consistency
+                    if not (lo < hi or (lo == hi and lo_inc and hi_inc)):
+                        raise errors.ParserError('Lower bound must not exceed upper bound', (lo, hi))
+
+                    tokens.append(
+                        ast.Token(predicate, ast.Continuous(lo, hi, lo_inc, hi_inc)))
+
+            else: # prevented by grammar
+                raise errors.ParserError('Invalid expression', exp)
+
+        return ast.AND(tokens)
+
+
+
+"""Default SearchParser instance.
+
+To produce an ast, call
+
+>>> ast_from_string(search)
+
+Convenience shortcut for
+
+>>> SearchParser().parse(search)
+
+"""
+ast_from_string = SearchParser(predicates)
+
+## EOF ##
diff --git a/tagit/parsing/sort.py b/tagit/parsing/sort.py
new file mode 100644
index 0000000..8950613
--- /dev/null
+++ b/tagit/parsing/sort.py
@@ -0,0 +1,192 @@
+"""
+
+Part of the tagit module.
+A copy of the license is provided with the project.
+Author: Matthias Baumgartner, 2022
+"""
+# external imports
+from pyparsing import CaselessKeyword, Group, Or, Word, delimitedList, oneOf, ParseException
+
+# tagit imports
+from tagit.utils import errors, Struct
+
+# exports
+__all__ = (
+    'sort_from_string',
+    )
+
+
+## code ##
+
+class SortParser():
+    """Sort parser.
+
+    A sort string can be as simple as a predicate, but also allows
+    a more verbose specification for more natural readability.
+    In brief and somewhat relaxed notation, the syntax is:
+        [sort [<type>] by] <predicate> [similarity to] [<anchor>] [<direction>]
+    Multiple sort terms are concatenated with a comma.
+
+    Examples:
+        time
+        time asc
+        sort by time desc
+        sort numerically by time downwards
+        sort by tag similarity to AF39D281CE3 up
+        time, iso
+    """
+    QUERY = None
+    PREDICATES = None
+
+    def __init__(self, sortkeys):
+        self.sortkeys = sortkeys
+
+    def __call__(self, query):
+        return self.parse(query)
+
+    def build_parser(self, predicate=None):
+        # The *predicate* argument is for compatibility with predicate listener.
+        # It's not actually used here.
+        """
+        The grammar is composed as follows:
+
+        QUERY := EXPR | EXPR, EXPR
+        EXPR := PREFIX PREDICATE SUFFIX | PREDICATE SUFFIX | PREFIX PREDICATE | PREDICATE
+        PREFIX := sort TYPE by | sort by
+        SUFFIX := SIMILAR DIRECTION | SIMILAR | DIRECTION
+        SIMILAR := similarity to ANCHOR | ANCHOR
+        TYPE := numerically | alphabetically
+        PREDICATE := [predicate]
+        ANCHOR := [guid]
+        DIRECTION := up | down | asc | desc | ascending | descending | reversed | upwards | downwards
+        """
+        # predicates from sortkeys
+        self.PREDICATES = self.sortkeys.scope.library | self.sortkeys.typedef.anchored
+
+        ## terminals
+        # direction is just a list of keywords
+        direction = oneOf('up down asc desc ascending descending reversed upwards downwards',
+                        caseless=True).setResultsName('direction')
+        # type is just a list of keywords
+        type_ = oneOf('numerically alphabetically').setResultsName('type')
+        # predicates are from an enum
+        predicate = Or([CaselessKeyword(p) for p in self.PREDICATES]).setResultsName('predicate')
+        # anchor is a hex digest
+        anchor = Word('abcdef0123456789ABCDEF').setResultsName('anchor')
+
+        ## rules
+        similar = Or([CaselessKeyword('similarity to') + anchor,
+                      anchor])
+        suffix = Or([similar + direction, similar,
+                     direction])
+        prefix = Or([CaselessKeyword('sort') + type_ + CaselessKeyword('by'),
+                     CaselessKeyword('sort by')])
+        expr = Group(Or([prefix + predicate + suffix,
+                    predicate + suffix,
+                    prefix + predicate,
+                    predicate]))
+
+        self.QUERY = delimitedList(expr, delim=',')
+        return self
+
+    def __del__(self):
+        if self.QUERY is not None: # remove listener
+            try:
+                self.sortkeys.ignore(self.build_parser)
+            except ImportError:
+                # The import fails if python is shutting down.
+                # In that case, the ignore becomes unnecessary anyway.
+                pass
+
+    def parse(self, sort):
+        if self.QUERY is None:
+            # initialize parser
+            self.build_parser()
+            # attach listener to receive future updates
+            self.sortkeys.listen(self.build_parser)
+
+        try:
+            parsed = self.QUERY.parseString(sort, parseAll=True)
+        except ParseException as e:
+            raise errors.ParserError('Cannot parse query', e)
+
+        # convert to AST
+        tokens = []
+        for exp in parsed:
+            args = Struct(
+                predicate=None,
+                type=None,
+                anchor=None,
+                direction='asc',
+                )
+            args.update(**exp.asDict())
+
+            # check predicate
+            if args.predicate is None: # prevented by grammar
+                raise errors.ParserError('Missing sort key', exp)
+            if args.predicate not in self.sortkeys: # prevented by grammar
+                raise errors.ParserError('Invalid sort key', exp)
+
+            # check direction
+            if args.direction in ('up', 'ascending', 'asc', 'upwards'):
+                reverse = False
+            elif args.direction in ('down', 'desc', 'descending', 'reversed', 'downwards'):
+                reverse = True
+            else: # prevented by grammar
+                raise errors.ParserError('Invalid direction', exp)
+
+            # infer type from predicate if needed
+            if args.anchor is not None:
+                args.type = 'anchored'
+            elif args.type is None:
+                typedef = self.sortkeys.predicate(args.predicate).typedef
+                if not len(typedef):
+                    raise errors.ParserError('Undefined type', exp)
+                elif len(typedef) == 1:
+                    args.type = list(typedef)[0].lower()
+                else:
+                    raise errors.ParserError('Ambiguous type', exp)
+
+            # translate types
+            args.type = {
+                    'numerically': 'numerical',
+                    'alphabetically': 'alphabetical'
+                    }.get(args.type, args.type)
+
+            # check type compatibility
+            admissible_types = {t.lower() for t in self.sortkeys.predicate(args.predicate).typedef}
+            if args.type not in admissible_types:
+                raise errors.ParserError('Invalid type for predicate', exp)
+            elif args.type == 'anchored' and args.anchor is None: # type set if anchor isn't None
+                raise errors.ParserError('No anchor given', exp)
+
+            # build AST
+            if args.type in ('anchored', ):
+                tokens.append(ast.AnchoredSort(args.predicate, args.anchor, reverse))
+            elif args.type in ('alphabetical', 'alphabetically'):
+                tokens.append(ast.AlphabeticalSort(args.predicate, reverse))
+            elif args.type in ('numerical', 'numerically'):
+                tokens.append(ast.NumericalSort(args.predicate, reverse))
+            else: # prevented by grammar
+                raise errors.ParserError('Invalid type for predicate', exp)
+
+        # aggregate if need be
+        if len(tokens) == 1:
+            return tokens[0]
+        else:
+            return ast.Order(*tokens)
+
+"""Default SortParser instance.
+
+To produce an ast, call
+
+>>> sort_from_string(sort)
+
+Convenience shortcut for
+
+>>> SortParser().parse(sort)
+
+"""
+sort_from_string = SortParser(sortkeys)
+
+## EOF ##
diff --git a/tagit/utils/errors.py b/tagit/utils/errors.py
index 1bed670..7a2556e 100644
--- a/tagit/utils/errors.py
+++ b/tagit/utils/errors.py
@@ -2,17 +2,17 @@
 
 Part of the tagit module.
 A copy of the license is provided with the project.
-Author: Matthias Baumgartner, 2018
+Author: Matthias Baumgartner, 2022
 """
 # exports
 __all__ = (
-            'EmptyFileError',
-            'LoaderError',
-            'NotAFileError',
-            'ProgrammingError',
-            'UserError',
-            'abstract',
-          )
+    'EmptyFileError',
+    'LoaderError',
+    'NotAFileError',
+    'ProgrammingError',
+    'UserError',
+    'abstract',
+    )
 
 
 ## code ##
@@ -49,4 +49,8 @@ class ParserBackendError(Exception):
     """Generic parser backend error."""
     pass
 
+class ParserError(Exception):
+    """String parsing failure."""
+    pass
+
 ## EOF ##
author	Matthias Baumgartner <dev@igsor.net>	2023-01-08 17:21:19 +0100
committer	Matthias Baumgartner <dev@igsor.net>	2023-01-08 17:21:19 +0100
commit	06904269d867ed7c8355c0615473baf524fcf30b (patch)
tree	e94ecf79bd1c7f2738d0f031675303ad1ee20e9e /tagit
parent	7cf03c4de5244e7db3f44362a275f92368fd86ac (diff)
download	tagit-06904269d867ed7c8355c0615473baf524fcf30b.tar.gz tagit-06904269d867ed7c8355c0615473baf524fcf30b.tar.bz2 tagit-06904269d867ed7c8355c0615473baf524fcf30b.zip