aboutsummaryrefslogtreecommitdiffstats
path: root/tagit/parsing/datefmt.py
blob: 49de1c062f9bb0456cb6e49510316af0953bf1fa (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
"""Parse and interpret date strings.

Consider the following date notations (DMY=04.11.2012):

DMY     04.11.12        europe
YMD     12.11.04        iso
MDY     11.04.12        US
YDM     12.04.11        reverse US
DYM     04.12.11        too uncommon, ignored
MYD     11.12.04        too uncommon, ignored

There's the general problem of ambiguity between the DMY and MDY formats.
Here, we give precedence to the DMY format.

Note that the MDY format can still be used in unambiguous settings or
with the month spelled out, e.g. "2012, 23rd of November"

Similarly, consider the following shortened date notations:

DM      04.11           europe, current year
MY      11.12           quarters
YM      12.11           quarters
MD      11.04           us, current year
DY      23.12           too uncommon, ignored
YD      12.23           too uncommon, ignored

In addition to the different spellings, month names can be spelled out
and the string can be cluttered with additional common words.

Part of the tagit module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# standard imports
from collections import Counter
from datetime import date as ddate, time as dtime, datetime, timedelta
from math import floor

# external imports
from dateutil.relativedelta import relativedelta
from pyparsing import Combine, Group, Literal, Optional, Or, Word, nums, oneOf, ParseException

# tagit imports
from tagit.utils import errors, Struct, flatten

# exports
# Public names of this module. Every entry must be followed by a comma:
# adjacent string literals are concatenated silently, which would merge two
# names into a single non-existent one.
__all__ = (
        # default format strings
        'DATE_FMT',
        'TIME_FMT',
        'DATETIME_FMT',
        # exceptions
        'DateParserError',
        'TimeParserError',
        'DateFormatError',
        # parsing
        'parse_datetime',
        'guess_datetime',
        # postprocessing
        'increment',
        )

## constants ##
"""Default strftime format strings."""
DATE_FMT = '%d.%m.%Y'
TIME_FMT = '%H:%M'
# date and time formats joined by a comma and a space
DATETIME_FMT = ', '.join((DATE_FMT, TIME_FMT))

# Literal months: maps the abbreviated and the full English month name to the
# month number (1-12). 'May' is its own abbreviation, hence it appears once.
MONTH_LIT = {
    name: number
    for number, names in enumerate((
        ('Jan', 'January'),
        ('Feb', 'February'),
        ('Mar', 'March'),
        ('Apr', 'April'),
        ('May', ),
        ('Jun', 'June'),
        ('Jul', 'July'),
        ('Aug', 'August'),
        ('Sep', 'September'),
        ('Oct', 'October'),
        ('Nov', 'November'),
        ('Dec', 'December'),
        ), start=1)
    for name in names
    }


## code ##

class DatefmtError(errors.ParserError):
    """Common base class of all errors defined in this module."""


class DateParserError(DatefmtError):
    """Raised when no date can be derived from the given tokens."""


class TimeParserError(DatefmtError):
    """Raised when no time can be derived from the given tokens."""


class DateFormatError(DatefmtError):
    """Raised when a date/time format string is empty or otherwise unusable."""

class DF(str):
    """Date/time format as derived from a user-supplied string.

    The format is a plain string in which each character names one unit
    (year, month, day, ...) that was specified, using the indicator
    characters defined below.
    """
    # indicator characters, ordered from the most to the least significant unit
    _valid_chars = "YMDhmsn"
    # explicit mapping from unit name to its indicator character
    year, month, day = 'Y', 'M', 'D'
    hour, minute, second, microsecond = 'h', 'm', 's', 'n'

    def valid(self):
        """Return the number of distinct unit indicators present (0 if none).

        The result is meant to be used as a truth value.
        """
        return len(set(self._valid_chars).intersection(self))

    def lsb(self):
        """Return the indicator of the smallest unit specified.

        Raises DateFormatError if the format contains no unit indicator.
        """
        if not self.valid():
            raise DateFormatError(
                'An empty date format string has no least significant position.', self)

        # scan from the least significant indicator upwards
        return next(char for char in reversed(self._valid_chars) if char in self)

    def msb(self):
        """Return the indicator of the highest unit specified.

        Raises DateFormatError if the format contains no unit indicator.
        """
        if not self.valid():
            raise DateFormatError(
                'An empty date format string has no most significant position.', self)

        # scan from the most significant indicator downwards
        return next(char for char in self._valid_chars if char in self)

    def is_time(self):
        """Return True if only a time (hour/minute/second/microsecond) was specified."""
        return bool(self.valid()) and self.msb() not in 'YMD'

    def is_date(self):
        """Return True if only a date (year/month/day) was specified."""
        return bool(self.valid()) and self.lsb() not in 'hmsn'

# priorities
# Candidate component orders, most preferred first. 'p2' lists the orders
# tried for two-part dates, 'p3' those for three-part dates.

# Day-before-month orders take precedence.
PRIORITIES_INT = {
    'p2': [DF(''.join(units)) for units in (
        (DF.day, DF.month),   # DM
        (DF.month, DF.year),  # MY
        (DF.year, DF.month),  # YM
        (DF.month, DF.day),   # MD
        (DF.day, DF.year),    # DY
        (DF.year, DF.day),    # YD
        )],
    'p3': [DF(''.join(units)) for units in (
        (DF.day, DF.month, DF.year),  # DMY
        (DF.year, DF.month, DF.day),  # YMD
        (DF.month, DF.day, DF.year),  # MDY
        (DF.year, DF.day, DF.month),  # YDM
        (DF.day, DF.year, DF.month),  # DYM
        (DF.month, DF.year, DF.day),  # MYD
        )],
    }

# Month-before-day orders take precedence.
PRIORITIES_US = {
    'p2': [DF(''.join(units)) for units in (
        (DF.month, DF.day),   # MD
        (DF.year, DF.month),  # YM
        (DF.day, DF.month),   # DM
        (DF.month, DF.year),  # MY
        (DF.day, DF.year),    # DY
        (DF.year, DF.day),    # YD
        )],
    'p3': [DF(''.join(units)) for units in (
        (DF.month, DF.day, DF.year),  # MDY
        (DF.year, DF.day, DF.month),  # YDM
        (DF.day, DF.month, DF.year),  # DMY
        (DF.year, DF.month, DF.day),  # YMD
        (DF.day, DF.year, DF.month),  # DYM
        (DF.month, DF.year, DF.day),  # MYD
        )],
    }

def guess_date(tokens, priorities=None):
    """Guess the date from a string in an unknown format.

    The method uses the following clues to guess the meaning of each part:
    * 4-digits implies it's a year
    * 1-digit discards year (since it's more common to write 04 instead of 4
      as a shorthand to 2004)
    * Literal month
    * 'of' is preceded by the day and succeeded by the month
    * Any of (st, nd, rd, th) on a number makes it a day
    * Number > 12 can't be a month
    * Number > 31 can't be a day
    * Date inexistence (e.g. 29.02.2018)
    * precedence DMY > YMD > MDY > YDM
    * precedence DM > MY > YM > MD

    *tokens* is an iterable of strings (numbers, month names, delimiters,
    'of'). *priorities* is a dict with the keys 'p2' and 'p3', each holding
    the list of formats to try (in that order) for two- and three-part dates.
    It defaults to PRIORITIES_INT.

    Returns a tuple (date, format), where *date* is a datetime.date and
    *format* is a DF that names the components in the order they were given.
    Unspecified components default to the current year, January, and the
    first day of the month, respectively.

    Raises DateParserError if the tokens cannot be interpreted as a date or
    describe a date that does not exist.
    """
    priorities = PRIORITIES_INT if priorities is None else priorities

    # We need to figure out which token corresponds to which component
    # (D, M, Y). Since this is ambiguous, guesswork is needed. We do so
    # by eliminating impossible options.

    # initially, all three components are viable
    guesses = [Struct(tok=tok.strip(), fmt=DF.year + DF.month + DF.day) for tok in tokens]

    # check indicators for specific formats
    for idx, guess in enumerate(guesses):
        tok = guess.tok

        if len(tok) == 1 and tok in '.,;':
            # delimiter tokens can be ignored
            guess.fmt = ''
        elif tok == 'of':
            # an 'of' token indicates a 'day of month' structure.
            # The neighbours are only touched if they exist: index -1 would
            # silently wrap around to the last token.
            if idx > 0:
                guesses[idx - 1].fmt = DF.day
            if idx + 1 < len(guesses):
                guesses[idx + 1].fmt = DF.month
            guess.fmt = ''
        elif tok[-2:] in ('st', 'nd', 'rd', 'th') and tok[:-2].isdigit():
            # ordinal suffix on a number indicates a day. The digit check keeps
            # the month literal 'August' (ends in 'st') from being mistaken
            # for an ordinal.
            guess.fmt = DF.day
            guess.tok = tok[:-2]
        elif len(tok) == 4 and tok.isdigit():
            # four digits must be a year
            guess.fmt = DF.year
        elif tok in MONTH_LIT:
            # spelled out month is - tadaaa - a month
            guess.tok = str(MONTH_LIT[tok])
            guess.fmt = DF.month

    # remove jitter (of, delimiters)
    guesses = [itm for itm in guesses if len(itm.fmt) > 0]

    # eliminate impossible options
    for guess in guesses:
        tok = guess.tok

        if len(tok) == 1:
            # can't be a year
            guess.fmt = guess.fmt.replace(DF.year, '')
        if tok.isdigit() and int(tok) > 12:
            # can't be a month
            guess.fmt = guess.fmt.replace(DF.month, '')
        if tok.isdigit() and int(tok) > 31:
            # can't be a day
            guess.fmt = guess.fmt.replace(DF.day, '')

    # define helper function
    def create_date(year, month, day):
        """Return a date for the given components or None if that is not possible."""
        # check format
        if DF.year not in year.fmt or DF.month not in month.fmt or DF.day not in day.fmt:
            return None

        try:
            year_tok = str(year.tok)
            if len(year_tok) == 2:
                # ten years into the future is still the current century,
                # otherwise the previous one
                threshold = ddate.today().year + 10 - 2000
                year_tok = ('20' if int(year_tok) < threshold else '19') + year_tok

            # create date
            return ddate(year=int(year_tok), month=int(month.tok), day=int(day.tok))
        except ValueError:
            # non-numeric token or inexistent date
            return None

    # placeholders for unspecified tokens
    pyear = Struct(tok=ddate.today().year, fmt=DF.year)
    pday = Struct(tok=1, fmt=DF.day)
    pmon = Struct(tok=1, fmt=DF.month)

    if len(guesses) == 1: # one-part date (Y)
        date = create_date(guesses[0], pmon, pday)
        if date is None:
            raise DateParserError('Two-digit date format must contain the year')
        return date, DF(DF.year)

    elif len(guesses) == 2: # two-part date (DM, MY, YM, MD)
        fst, snd = guesses
        # check components
        if len(set(fst.fmt + snd.fmt)) < 2:
            raise DateParserError('Invalid two-digit date format')

        if len(fst.fmt) == 1 and len(snd.fmt) == 1: # fully determined
            parts = {
                DF.year: pyear,
                DF.month: pmon,
                DF.day: pday,
                }
            parts.update({
                fst.fmt: fst,
                snd.fmt: snd,
                })
            date = create_date(parts[DF.year], parts[DF.month], parts[DF.day])
            if date is None:
                # roles are clear but the date does not exist (e.g. 30th of Feb)
                raise DateParserError('Invalid date')
            return date, DF(fst.fmt + snd.fmt)

        # walk through prioritized formats
        formats = {
            DF(DF.day + DF.month):  create_date(pyear, snd, fst), # DM
            DF(DF.month + DF.year): create_date(snd, fst, pday),  # MY
            DF(DF.year + DF.month): create_date(fst, snd, pday),  # YM
            DF(DF.month + DF.day):  create_date(pyear, fst, snd), # MD
            DF(DF.day + DF.year):   create_date(snd, pmon, fst),  # DY
            DF(DF.year + DF.day):   create_date(fst, pmon, snd),  # YD
        }

        for fmt in priorities['p2']:
            if formats.get(fmt, None) is not None:
                return formats[fmt], fmt

        raise DateParserError('Cannot guess roles of a two-digit date format')

    elif len(guesses) == 3: # three-part date (DMY, YMD, MDY, YDM)

        # eliminate options based on uniqueness of component assignment
        changed = True
        while changed:
            # resolved guesses: item has only one possible component option
            resolved = {itm.fmt for itm in guesses if len(itm.fmt) == 1}
            # single choice: component has only one possible position
            unique = {comp for comp, freq in
                      Counter(flatten([set(itm.fmt) for itm in guesses])).items()
                      if freq == 1}
            # assume no changes
            changed = False
            for itm in guesses:
                if unique & set(itm.fmt) and not set(itm.fmt).issubset(unique):
                    # itm is the only option for one component
                    itm.fmt = DF(''.join(unique & set(itm.fmt)))
                    changed = True
                elif resolved & set(itm.fmt) and not set(itm.fmt).issubset(resolved):
                    # itm contains options that are already taken by a different item
                    itm.fmt = itm.fmt.translate(str.maketrans('', '', ''.join(resolved)))
                    changed = True

        fst, snd, trd = guesses

        # check components
        if len(set(fst.fmt + snd.fmt + trd.fmt)) < 3:
            raise DateParserError('Invalid three-digit date format')

        if len(fst.fmt) == 1 and len(snd.fmt) == 1 and len(trd.fmt) == 1: # fully determined
            parts = {
                fst.fmt: fst,
                snd.fmt: snd,
                trd.fmt: trd,
                }
            date = create_date(parts[DF.year], parts[DF.month], parts[DF.day])
            if date is None:
                # roles are clear but the date does not exist (e.g. 31.04.2012)
                raise DateParserError('Invalid date')
            return date, DF(fst.fmt + snd.fmt + trd.fmt)

        # walk through prioritized formats
        formats = {
            DF(DF.day + DF.month + DF.year): create_date(year=trd, month=snd, day=fst), # DMY
            DF(DF.year + DF.month + DF.day): create_date(year=fst, month=snd, day=trd), # YMD
            DF(DF.month + DF.day + DF.year): create_date(year=trd, month=fst, day=snd), # MDY
            DF(DF.year + DF.day + DF.month): create_date(year=fst, month=trd, day=snd), # YDM
            DF(DF.day + DF.year + DF.month): create_date(year=snd, month=trd, day=fst), # DYM
            DF(DF.month + DF.year + DF.day): create_date(year=snd, month=fst, day=trd), # MYD
        }

        for fmt in priorities['p3']:
            if formats.get(fmt, None) is not None:
                return formats[fmt], fmt

        raise DateParserError('Cannot guess the roles of a three-digit date format')

    raise DateParserError('Cannot parse the date format')

def guess_time(tokens):
    """Guess the time from a string in an unknown format.

    * Always sorted from hi (hour) to low (sec)
    * 4 Terms -> hour, min, sec, fraction of a second
    * 3 Terms -> hour, min, sec
    * 2 Terms -> hour, min | min, sec
      * am or pm present -> hour, min
      * first term > 24 -> min, sec
      * otherwise -> hour, min
    * 1 Term -> hour

    With am/pm, "12 am" is midnight (00:00) and "12 pm" is noon (12:00).

    The fraction of a second is read as a decimal fraction, i.e. ".25" is
    250000 microseconds. At most six digits are supported.

    *tokens* is an iterable of strings (numbers, separators, 'am'/'pm').

    Returns a tuple (time, format), where *time* is a datetime.time and
    *format* is a DF that names the specified components.

    Raises TimeParserError if no number was given, a value is out of range,
    or the number of terms is not supported.
    """
    # remove separators
    tokens = [tok.lower() for tok in tokens if tok not in '.,:']
    # check if the am/pm format was used
    is_am = 'am' in tokens
    is_pm = 'pm' in tokens

    # remove non-numbers
    tokens = [tok for tok in tokens if tok.isdigit()]
    if not tokens:
        raise TimeParserError('No numeric time component found')

    # the last term as fraction of a second, right-padded to microseconds
    microseconds = int(tokens[-1].ljust(6, '0'))
    # convert to int
    tokens = [int(tok) for tok in tokens]

    def to_24h(hour):
        """Translate an hour in am/pm notation to the 24h clock."""
        if is_am:
            # 12am is midnight
            return 0 if hour == 12 else hour
        if is_pm:
            # 12pm is noon, all other hours are shifted to the afternoon
            return hour if hour == 12 else hour + 12
        return hour

    # guess format
    try:
        if len(tokens) == 4: # H:M:S.fraction
            tokens[-1] = microseconds
            return dtime(*tokens), DF(DF.hour + DF.minute + DF.second + DF.microsecond)
        elif len(tokens) == 3: # H:M:S
            return dtime(*tokens), DF(DF.hour + DF.minute + DF.second)
        elif len(tokens) == 2: # H:M or M:S
            if is_am or is_pm: # am/pm notation was used
                return dtime(to_24h(tokens[0]), tokens[1]), DF(DF.hour + DF.minute)
            elif tokens[0] > 24: # min, sec
                return dtime(0, *tokens), DF(DF.minute + DF.second)
            else: # hour, min
                return dtime(*tokens), DF(DF.hour + DF.minute)
        elif len(tokens) == 1: # H
            return dtime(to_24h(tokens[0])), DF(DF.hour)

    except ValueError as err:
        # invalid value was supplied, e.g. hour=85
        raise TimeParserError('Invalid value', tokens) from err

    raise TimeParserError('Unknown time format', tokens)

def guess_datetime(exp):
    """Turn a parsed date and/or time expression into a datetime.

    *exp* is supposed to be a pyparsing.ParseResults instance as returned by
    DATETIME.parseString(...), holding a 'date' and/or a 'time' result.
    The components have to be guessed since dates like 10.11.12 are ambiguous.

    Returns a tuple (datetime, format), where *format* names the components
    that were specified. If only a time is present, it is combined with the
    local date of timestamp zero.

    Raises DateFormatError if *exp* holds neither a date nor a time.
    """
    # For now I assumed unique separators (dot for date, colon for time, comma to separate the two)
    has_date = 'date' in exp
    has_time = 'time' in exp

    if not has_date and not has_time:
        raise DateFormatError('Neither a date nor a time was found.')

    if not has_time: # date only
        day, day_fmt = guess_date(exp.date)
        return datetime(day.year, day.month, day.day), day_fmt

    if not has_date: # time only
        clock, clock_fmt = guess_time(exp.time)
        return datetime.combine(ddate.fromtimestamp(0), clock), clock_fmt

    # both parts present
    day, day_fmt = guess_date(exp.date)
    clock, clock_fmt = guess_time(exp.time)
    return datetime.combine(day, clock), DF(day_fmt + clock_fmt)

def increment(date, fmt):
    """Increment the LSB of a datetime instance by one.

    The smallest unit named in *fmt* (a DF) is advanced by one, and all units
    below it are reset to their lowest value. Raises DateFormatError if *fmt*
    is empty or invalid.
    """
    if fmt == '' or not fmt.valid():
        raise DateFormatError('Invalid date format string', fmt)

    # relativedelta arguments per least significant unit: plural names are
    # relative offsets, singular names set the respective field.
    steps = {
        # 5.11.2012, 06:24:18.25 -> 5.11.2012, 06:24:18.250001
        fmt.microsecond: dict(microseconds=1),
        # 5.11.2012, 06:24:18 -> 5.11.2012, 06:24:19
        fmt.second: dict(seconds=1, microsecond=0),
        # 5.11.2012, 06:24 -> 5.11.2012, 06:25
        fmt.minute: dict(minutes=1, second=0, microsecond=0),
        # 5.11.2012, 06am -> 5.11.2012, 07:00
        fmt.hour: dict(hours=1, minute=0, second=0, microsecond=0),
        # 5.11.2012 -> 6.11.2012, 00:00
        fmt.day: dict(days=1, hour=0, minute=0, second=0, microsecond=0),
        # 11.2012 -> 1.12.2012, 00:00
        fmt.month: dict(months=1, day=1, hour=0, minute=0, second=0, microsecond=0),
        }
    # anything else is a year: 2012 -> 1.1.2013, 00:00
    year_step = dict(years=1, month=1, day=1, hour=0, minute=0, second=0, microsecond=0)

    return date + relativedelta(**steps.get(fmt.lsb(), year_step))

class DateTimeParser():
    """Parser that turns a user-supplied date and/or time string into a datetime.

    The pyparsing grammars are created by build_parser() and stored in the
    DATE, TIME and DATETIME attributes. Instances are callable; calling one
    parses a string and returns the guessed datetime.
    """

    # grammars; None until build_parser() was called
    DATE = None
    TIME = None
    DATETIME = None

    def build_parser(self):
        """Build the grammars for dates, times, and their combination.

        Sets the attributes DATE, TIME and DATETIME and returns the parser
        instance itself. The date grammar is:

        DATE        := YMD | DMY | MDY | YDM
        YMD         := YEAR SEP MON SEP DAY
        DMY         := DAY SEP [of] MON SEP YEAR
        MDY         := MON SEP DAY SEP YEAR
        YDM         := YEAR SEP DAY [of] MON
        DM          := DAY SEP [of] MON
        YM          := YEAR SEP MON
        MY          := MON SEP YEAR
        MD          := MON SEP DAY
        DAY         := [D]D | [D]D st | [D]D nd | [D]D rd | [D]D th
        MON         := [M]M | [month]
        YEAR        := [YY]YY
        SEP         := . | , | [whitespace]
        {D,M,Y}     := [digit]

        NOTE: The implementation is currently narrower than the grammar above:
        the only date separator is the dot and 'of' replaces the separator
        (see the FIXMEs below).
        """
        # FIXME: Allow more patterns (e.g. 2012, 10; April, 5th; April, 2020)
        sep = Literal('.') # FIXME: Allow '. - :'
        # a year has either two or four digits
        year = Word(nums, exact=2) ^ Word(nums, exact=4)
        # a month is either a number or one of the names in MONTH_LIT
        month = Word(nums, min=1, max=2) ^ oneOf(list(MONTH_LIT.keys()))
        # a day is a number with an optional ordinal suffix, kept as one token
        day = Combine(Word(nums, min=1, max=2) + Optional(oneOf('st nd rd th')))
        # three-part-date
        YMD     = year + sep + month + sep + day
        DMY     = day + (sep ^ 'of') + month + sep + year
        MDY     = month + sep + day + sep + year
        YDM     = year + sep + day + (sep ^ 'of') + month
        # two-part-date
        DM      = day + (sep ^ 'of')+ month
        YM      = year + sep + month
        MY      = month + sep + year
        MD      = month + sep + day
        Y       = Word(nums, exact=4)
        # date parser
        # The alternatives are joined with '|', hence their order matters
        # (presumably first match wins as per pyparsing's MatchFirst - verify
        # against the pyparsing documentation before reordering).
        self.DATE = Group(YMD | DMY | YDM | MDY | DM | YM | MY | MD | Y).setResultsName('date')

        """
        TIME        := HOUR SEP MIN [SEP SEC [. MS]] | HOUR SEP MIN | HOUR [SEP MIN] {am|pm}
        HOUR        := [H]H
        MIN         := [M]M
        SEC         := [S]S
        {H,M,S}     := [digit]
        SEP         := : | . | ,
        """
        # NOTE: from here on, sep is the time separator (colon only)
        sep = Literal(':') # FIXME: Allow '. : -'
        # hour, minute or second: one or two digits
        HMS = Word(nums, min=1, max=2)
        # fraction of a second: any number of digits
        MS = Word(nums, min=1)
        # time parser
        self.TIME = Group(HMS + sep + HMS + sep + HMS + oneOf('. :') + MS \
                | HMS + sep + HMS + sep + HMS \
                | HMS + Optional(sep + HMS) + oneOf('am pm') \
                | HMS + sep + HMS ).setResultsName('time')

        """
        DATETIME    := DATE | TIME | DATE SEP TIME | TIME SEP DATE
        SEP         := , [whitespace]
        """
        # date only, time only, or both in either order with an optional comma
        self.DATETIME = Group(
                        self.DATE \
                        ^ self.TIME \
                        ^ self.DATE + Optional(',') + self.TIME \
                        ^ self.TIME + Optional(',') + self.DATE \
                        ).setResultsName('datetime')
        return self

    def __call__(self, datestr):
        """Parse *datestr* and return the guessed datetime.

        The grammar is built on first use if build_parser() was not called
        before. The whole string has to match the DATETIME grammar.

        Raises errors.ParserError if *datestr* does not match the grammar.
        Errors raised by guess_datetime are not caught here.
        """
        if self.DATETIME is None:
            self.build_parser()

        try:
            # the guessed format is not needed here, only the datetime
            date, fmt = guess_datetime(self.DATETIME.parseString(datestr, parseAll=True)[0])
            return date

        except ParseException as e:
            raise errors.ParserError('Cannot parse query', e)


"""Default DateTimeParser instance.

To produce a datetime, call

>>> parse_datetime(datestring)

Convenience shortcut for

>>> DateTimeParser()(datestring)

"""
parse_datetime = DateTimeParser().build_parser()

## EOF ##