"""Parse and interpret date strings. Consider the following date notations (DMY=04.11.2012): DMY 04.11.12 europe YMD 12.11.04 iso MDY 11.04.12 US YDM 12.04.11 reverse US DYM 04.12.11 too uncommon, ignored MYD 11.12.04 too uncommon, ignored There's the general problem of ambiguity between the DMY and MDY formats. Here, we give precedence to the DMY format. Note that the MDY format can still be used in unambiguous settings or with the month spelled out, e.g. "2012, 23th of November" Similarly, consider the following shortened date notations: DM 04.11 europe, current year MY 11.12 quarters YM 12.11 quarters MD 11.04 us, current year DY 23.12 too uncommon, ignored YD 12.23 too uncommon, ignored In addition to the different spellings, month names can be spelled out and the string can be cluttered with additional common words. Part of the tagit module. A copy of the license is provided with the project. Author: Matthias Baumgartner, 2022 """ # standard imports from collections import Counter from datetime import date as ddate, time as dtime, datetime, timedelta from math import floor # external imports from dateutil.relativedelta import relativedelta from pyparsing import Combine, Group, Literal, Optional, Or, Word, nums, oneOf, ParseException # tagit imports from tagit.utils import errors, Struct, flatten # exports __all__ = ( # default format strings 'DATE_FMT', 'TIME_FMT', 'DATETIME_FMT', # exceptions 'DateParserError', 'TimeParserError', 'DateFormatError' # parsing 'parse_datetime', 'guess_datetime', # postprocessing 'increment', ) ## constants ## """Default strftime format strings.""" DATE_FMT = '%d.%m.%Y' TIME_FMT = '%H:%M' DATETIME_FMT = '%d.%m.%Y, %H:%M' # Literal months MONTH_LIT = { 'Jan' : 1, 'January' : 1, 'Feb' : 2, 'February' : 2, 'Mar' : 3, 'March' : 3, 'Apr' : 4, 'April' : 4, 'May' : 5, 'Jun' : 6, 'June' : 6, 'Jul' : 7, 'July' : 7, 'Aug' : 8, 'August' : 8, 'Sep' : 9, 'September' : 9, 'Oct' : 10, 'October' : 10, 'Nov' : 11, 'November' : 11, 'Dec' : 12, 'December' : 12, } ## code ## class DatefmtError(errors.ParserError): pass class DateParserError(DatefmtError): pass class TimeParserError(DatefmtError): pass class DateFormatError(DatefmtError): pass class DF(str): """date/time user-supplied format.""" # indicator characters, highest to lowest. _valid_chars = "YMDhmsn" # explicit mapping from unit to character year = 'Y' month = 'M' day = 'D' hour = 'h' minute = 'm' second = 's' microsecond = 'n' def valid(self): return len(self) and len(set(self._valid_chars) & set(self)) def lsb(self): """Smallest unit specified.""" if not self.valid(): raise DateFormatError( 'An empty date format string has no least significant position.', self) return [i for i in self._valid_chars if i in self][-1] def msb(self): """Highest unit specified.""" if not self.valid(): raise DateFormatError( 'An empty date format string has no most significant position.', self) return [i for i in self._valid_chars if i in self][0] def is_time(self): """Return true if only a time (hour/minute/second/ms) was specified.""" return True if self.valid() and self.msb() not in 'YMD' else False def is_date(self): """Return true if only a date (year/month/day) was specified.""" return True if self.valid() and self.lsb() not in 'hmsn' else False # priorities PRIORITIES_INT = { 'p2': [ DF(DF.day + DF.month), # DM DF(DF.month + DF.year), # MY DF(DF.year + DF.month), # YM DF(DF.month + DF.day), # MD DF(DF.day + DF.year), # DY DF(DF.year + DF.day), # YD ], 'p3': [ DF(DF.day + DF.month + DF.year), # DMY DF(DF.year + DF.month + DF.day), # YMD DF(DF.month + DF.day + DF.year), # MDY DF(DF.year + DF.day + DF.month), # YDM DF(DF.day + DF.year + DF.month), # DYM DF(DF.month + DF.year + DF.day), # MYD ] } PRIORITIES_US = { 'p2': [ DF(DF.month + DF.day), DF(DF.year + DF.month), DF(DF.day + DF.month), DF(DF.month + DF.year), DF(DF.day + DF.year), DF(DF.year + DF.day), ], 'p3': [ DF(DF.month + DF.day + DF.year), DF(DF.year + DF.day + DF.month), DF(DF.day + DF.month + DF.year), DF(DF.year + DF.month + DF.day), DF(DF.day + DF.year + DF.month), DF(DF.month + DF.year + DF.day), ] } def guess_date(tokens, priorities=None): """Guess the date from a string in an unknown format. The method uses the following clues to guess the meaning of each part: * 4-digits implies it's a year * 1-digit discards year (since it's more common to write 04 instead of 4 as a shorthand to 2004 * Literal month * 'of' is preceeded by day and succeeded by the month * Any of (st, nd, rd, th) on a number makes it a day * Number > 12 can't be a month * Number > 31 can't be a day * Date inexistence (e.g. 29.02.2018) * precedence DMY > YMD > MDY > YDM * precedence DM > MY > YM > MD """ priorities = PRIORITIES_INT if priorities is None else priorities # We need to figure out which token corresponds to which component # (D, M, Y). Since this is ambiguous, guesswork is needed. We do so # by eliminating impossible options. # initially, all three components are viable guesses = [Struct(tok=tok.strip(), fmt=DF.year + DF.month + DF.day) for tok in tokens] # check indicators for specific formats for idx in range(len(guesses)): tok, options = guesses[idx].tok, guesses[idx].fmt if len(tok) == 1 and tok in '.,;': # delimiter tokens can be ignored guesses[idx].fmt = '' elif tok == 'of': # an 'of' token indicates a 'day of month' structure guesses[idx-1].fmt = DF.day guesses[idx+1].fmt = DF.month guesses[idx].fmt = '' elif tok[-2:] in ('st', 'nd', 'rd', 'th'): # suffix indicates a day guesses[idx].fmt = DF.day guesses[idx].tok = tok[:-2] elif len(tok) == 4 and tok.isdigit(): # four digits must be a year guesses[idx].fmt = DF.year elif tok in MONTH_LIT: # spelled out month is - tadaaa - a month guesses[idx].tok = str(MONTH_LIT[tok]) guesses[idx].fmt = DF.month # remove jitter (of, delimiters) guesses = [itm for itm in guesses if len(itm.fmt) > 0] # eliminate impossible options for idx in range(len(guesses)): tok, options = guesses[idx].tok, guesses[idx].fmt if len(tok) == 1: # can't be a year guesses[idx].fmt = guesses[idx].fmt.replace(DF.year, '') if tok.isdigit() and int(tok) > 12: # can't be a month guesses[idx].fmt = guesses[idx].fmt.replace(DF.month, '') if tok.isdigit() and int(tok) > 31: # can't be a day guesses[idx].fmt = guesses[idx].fmt.replace(DF.day, '') # define helper function def create_date(year, month, day): """Return a datetime for the given components or None if that is not possible.""" # check format if DF.year not in year.fmt or DF.month not in month.fmt or DF.day not in day.fmt: return None if len(str(year.tok)) == 2: # ten years into the future is still the current century, otherwise the previous one threshold = ddate.today().year + 10 - 2000 year = Struct( tok='20'+str(year.tok) if int(year.tok) < threshold else '19'+str(year.tok), fmt=year.fmt ) try: # create date return ddate(year=int(year.tok), month=int(month.tok), day=int(day.tok)) except ValueError: return None # placeholders for unspecified tokens pyear = Struct(tok=ddate.today().year, fmt=DF.year) pday = Struct(tok=1, fmt=DF.day) pmon = Struct(tok=1, fmt=DF.month) if len(guesses) == 1: # one-part date (Y) itm = guesses[0] date = create_date(itm, pmon, pday) if date is not None: return date, DF(DF.year) else: raise DateParserError('Two-digit date format must contain the year') elif len(guesses) == 2: # two-part date (DM, MY, YM, MD) fst, snd = guesses # check components if len(set(fst.fmt + snd.fmt)) < 2: raise DateParserError('Invalid two-digit date format') if len(fst.fmt) == 1 and len(snd.fmt) == 1: # fully determined date = { DF.year: pyear, DF.month: pmon, DF.day: pday, } date.update({ fst.fmt: fst, snd.fmt: snd, }) return create_date(date[DF.year], date[DF.month], date[DF.day]), DF(fst.fmt + snd.fmt) # walk through prioritized formats formats = { DF(DF.day + DF.month): create_date(pyear, snd, fst), # DM DF(DF.month + DF.year): create_date(snd, fst, pday), # MY DF(DF.year + DF.month): create_date(fst, snd, pday), # YM DF(DF.month + DF.day): create_date(pyear, fst, snd), # MD DF(DF.day + DF.year): create_date(snd, pmon, fst), # DY DF(DF.year + DF.day): create_date(fst, pmon, snd), # YD } for fmt in priorities['p2']: if formats.get(fmt, None) is not None: return formats[fmt], fmt raise DateParserError('Cannot guess roles of a two-digit date format') elif len(guesses) == 3: # three-part date (DMY, YMD, MDY, YMD) # eliminate options based on uniqueness of component assignment changed = True while changed: # resolved guesses: item has only one possible component option resolved = set([itm.fmt for itm in guesses if len(itm.fmt) == 1]) # single choice: component has only one possible position unique = {comp for comp, freq in Counter(flatten([set(itm.fmt) for itm in guesses])).items() if freq == 1} # assume no changes changed = False for itm in guesses: if unique & set(itm.fmt) and not set(itm.fmt).issubset(unique): # itm is the only option for one component itm.fmt = DF(''.join(unique & set(itm.fmt))) changed = True elif resolved & set(itm.fmt) and not set(itm.fmt).issubset(resolved): # itm contains options that already taken by a different item itm.fmt = itm.fmt.translate(str.maketrans('', '', ''.join(resolved))) changed = True fst, snd, trd = guesses # check components if len(set(fst.fmt + snd.fmt + trd.fmt)) < 3: raise DateParserError('Invalid three-digit date format') if len(fst.fmt) == 1 and len(snd.fmt) == 1 and len(trd.fmt) == 1: # fully determined date = { fst.fmt: fst, snd.fmt: snd, trd.fmt: trd, } return (create_date(date[DF.year], date[DF.month], date[DF.day]), DF(fst.fmt + snd.fmt + trd.fmt)) # walk through prioritized formats formats = { DF(DF.day + DF.month + DF.year): create_date(year=trd, month=snd, day=fst), # DMY DF(DF.year + DF.month + DF.day): create_date(year=fst, month=snd, day=trd), # YMD DF(DF.month + DF.day + DF.year): create_date(year=trd, month=fst, day=snd), # MDY DF(DF.year + DF.day + DF.month): create_date(year=fst, month=trd, day=snd), # YDM DF(DF.day + DF.year + DF.month): create_date(year=snd, month=trd, day=fst), # DYM DF(DF.month + DF.year + DF.day): create_date(year=snd, month=fst, day=trd), # MYD } for fmt in priorities['p3']: if formats.get(fmt, None) is not None: return formats[fmt], fmt raise DateParserError('Cannot guess the roles of a three-digit date format') raise DateParserError('Cannot parse the date format') def guess_time(tokens): """Guess the time from a string in an unknown format. * Always sorted from hi (hour) to low (sec) * 4 Terms -> hour, min, sec, ns * 3 Terms -> hour, min, sec * 2 Terms -> hour, min | min, sec * both terms > 24 -> min, sec * am or pm present -> hour, min * Dot separation -> min, sec * Colon separation -> hour, min """ # remove spearators tokens = [tok.lower() for tok in tokens if tok not in '.,:'] # check if the am/pm format was used is_am = 'am' in tokens is_pm = 'pm' in tokens # remove non-numbers tokens = [tok for tok in tokens if tok.isdigit()] if not len(tokens): raise TimeParserError() # convert to int ms = int(tokens[-1].ljust(6, '0')) tokens = [int(tok) for tok in tokens] # guess format try: if len(tokens) == 4: # H:M:S.NS tokens[-1] = ms return dtime(*tokens), DF(DF.hour + DF.minute + DF.second + DF.microsecond) elif len(tokens) == 3: # H:M:S return dtime(*tokens), DF(DF.hour + DF.minute + DF.second) elif len(tokens) == 2: # H:M or M:S if is_am: # am/pm notation was used return dtime(*tokens), DF(DF.hour + DF.minute) elif is_pm: # am/pm notation was used return dtime(tokens[0] + 12, tokens[1]), DF(DF.hour + DF.minute) elif tokens[0] > 24: # min, sec return dtime(0, *tokens), DF(DF.minute + DF.second) else: # hour, sec return dtime(*tokens), DF(DF.hour + DF.minute) elif len(tokens) == 1: # H if is_am: # am/pm notation was used return dtime(tokens[0]), DF(DF.hour) elif is_pm: # am/pm notation was used return dtime(tokens[0] + 12), DF(DF.hour) else: return dtime(tokens[0], 0), DF(DF.hour) except ValueError: # invalid value was supplied, e.g. hour=85 raise TimeParserError('Invalid value', tokens) raise TimeParserError('Unknown time format', tokens) def guess_datetime(exp): """Return a datetime instance by guessing the components of a DATETIME parsed user-supplied date and/or time string. Guessing might be necessary since dates like 10.11.12 are ambiguous. *exp* is supposed to be a pyparsing.ParseResults instance as returned by DATETIME.parseString(...). """ # For now I assumed unique separators (dot for date, colon for time, comma to separate the two) if 'date' in exp and 'time' in exp: # both parts present date, dfmt = guess_date(exp.date) time, tfmt = guess_time(exp.time) return datetime.combine(date, time), DF(dfmt+tfmt) elif 'date' in exp: # date present date, dfmt = guess_date(exp.date) return datetime(date.year, date.month, date.day), dfmt elif 'time' in exp: # time present time, tfmt = guess_time(exp.time) return datetime.combine(ddate.fromtimestamp(0), time), tfmt else: raise DateFormatError('Neither a date nor a time was found.') def increment(date, fmt): """Increment the LSB of a datetime instance by one.""" if fmt == '' or not fmt.valid(): raise DateFormatError('Invalid date format string', fmt) elif fmt.lsb() == fmt.microsecond: # 5.11.2012, 06:24:18.25 -> 5.11.2012, 06:25:18.26 return date + relativedelta(microseconds=1) elif fmt.lsb() == fmt.second: # 5.11.2012, 06:24:18 -> 5.11.2012, 06:25:19 return date + relativedelta(seconds=1, microsecond=0) elif fmt.lsb() == fmt.minute: # 5.11.2012, 06:24 -> 5.11.2012, 06:25 return date + relativedelta(minutes=1, second=0, microsecond=0) elif fmt.lsb() == fmt.hour: # 5.11.2012, 06am -> 5.11.2012, 07:00 return date + relativedelta(hours=1, minute=0, second=0, microsecond=0) elif fmt.lsb() == fmt.day: # 5.11.2012 -> 6.11.2012, 00:00 return date + relativedelta(days=1, hour=0, minute=0, second=0, microsecond=0) elif fmt.lsb() == fmt.month: # 11.2012 -> 1.12.2012 return date + relativedelta(months=1, day=1, hour=0, minute=0, second=0, microsecond=0) else: # fmt.lsb() == fmt.year: # 2012 -> 1.1.2013, 00:00 return date + relativedelta( years=1, month=1, day=1, hour=0, minute=0, second=0, microsecond=0) class DateTimeParser(): DATE = None TIME = None DATETIME = None def build_parser(self): """ DATE := YMD | DMY | MDY | YDM YMD := YEAR SEP MON SEP DAY DMY := DAY SEP [of] MON SEP YEAR MDY := MON SEP DAY SEP YEAR YDM := YEAR SEP DAY [of] MON DM := DAY SEP [of] MON YM := YEAR SEP MON MY := MON SEP YEAR MD := MON SEP DAY DAY := [D]D | [D]D st | [D]D nd | [D]D rd | [D]D th MON := [M]M | [month] YEAR := [YY]YY SEP := . | , | [whitespace] {D,M,Y} := [digit] """ # FIXME: Allow more patterns (e.g. 2012, 10; April, 5th; April, 2020) sep = Literal('.') # FIXME: Allow '. - :' year = Word(nums, exact=2) ^ Word(nums, exact=4) month = Word(nums, min=1, max=2) ^ oneOf(list(MONTH_LIT.keys())) day = Combine(Word(nums, min=1, max=2) + Optional(oneOf('st nd rd th'))) # three-part-date YMD = year + sep + month + sep + day DMY = day + (sep ^ 'of') + month + sep + year MDY = month + sep + day + sep + year YDM = year + sep + day + (sep ^ 'of') + month # two-part-date DM = day + (sep ^ 'of')+ month YM = year + sep + month MY = month + sep + year MD = month + sep + day Y = Word(nums, exact=4) # date parser self.DATE = Group(YMD | DMY | YDM | MDY | DM | YM | MY | MD | Y).setResultsName('date') """ TIME := HOUR SEP MIN [SEP SEC [. MS]] | HOUR SEP MIN | HOUR [SEP MIN] {am|pm} HOUR := [H]H MIN := [M]M SEC := [S]S {H,M,S} := [digit] SEP := : | . | , """ sep = Literal(':') # FIXME: Allow '. : -' HMS = Word(nums, min=1, max=2) MS = Word(nums, min=1) # time parser self.TIME = Group(HMS + sep + HMS + sep + HMS + oneOf('. :') + MS \ | HMS + sep + HMS + sep + HMS \ | HMS + Optional(sep + HMS) + oneOf('am pm') \ | HMS + sep + HMS ).setResultsName('time') """ DATETIME := DATE | TIME | DATE SEP TIME | TIME SEP DATE SEP := , [whitespace] """ self.DATETIME = Group( self.DATE \ ^ self.TIME \ ^ self.DATE + Optional(',') + self.TIME \ ^ self.TIME + Optional(',') + self.DATE \ ).setResultsName('datetime') return self def __call__(self, datestr): if self.DATETIME is None: self.build_parser() try: date, fmt = guess_datetime(self.DATETIME.parseString(datestr, parseAll=True)[0]) return date except ParseException as e: raise errors.ParserError('Cannot parse query', e) """Default DateTimeParser instance. To produce an datetime, call >>> parse_datetime(datestring) Convenience shortcut for >>> DateTimeParser().parse(datestring) """ parse_datetime = DateTimeParser().build_parser() ## EOF ##