1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
|
"""Parse and interpret date strings.
Consider the following date notations (DMY=04.11.2012):
DMY 04.11.12 europe
YMD 12.11.04 iso
MDY 11.04.12 US
YDM 12.04.11 reverse US
DYM 04.12.11 too uncommon, ignored
MYD 11.12.04 too uncommon, ignored
There's the general problem of ambiguity between the DMY and MDY formats.
Here, we give precedence to the DMY format.
Note that the MDY format can still be used in unambiguous settings or
with the month spelled out, e.g. "2012, 23th of November"
Similarly, consider the following shortened date notations:
DM 04.11 europe, current year
MY 11.12 quarters
YM 12.11 quarters
MD 11.04 us, current year
DY 23.12 too uncommon, ignored
YD 12.23 too uncommon, ignored
In addition to the different spellings, month names can be spelled out
and the string can be cluttered with additional common words.
Part of the tagit module.
A copy of the license is provided with the project.
Author: Matthias Baumgartner, 2022
"""
# standard imports
from collections import Counter
from datetime import date as ddate, time as dtime, datetime, timedelta
from math import floor
# external imports
from dateutil.relativedelta import relativedelta
from pyparsing import Combine, Group, Literal, Optional, Or, Word, nums, oneOf, ParseException
# tagit imports
from tagit.utils import errors, Struct, flatten
# exports
# Public API of this module. Every entry needs its own trailing comma:
# adjacent string literals are silently concatenated into a single name.
__all__ = (
    # default format strings
    'DATE_FMT',
    'TIME_FMT',
    'DATETIME_FMT',
    # exceptions
    'DateParserError',
    'TimeParserError',
    'DateFormatError',
    # parsing
    'parse_datetime',
    'guess_datetime',
    # postprocessing
    'increment',
)
## constants ##
"""Default strftime format strings."""
# strftime patterns: day.month.year and hour:minute
DATE_FMT = '%d.%m.%Y'
TIME_FMT = '%H:%M'
# date and time patterns joined by a comma and a space
DATETIME_FMT = ', '.join((DATE_FMT, TIME_FMT))
# English month names in calendar order
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)

# Literal months: maps the three-letter abbreviation and the full name of
# each month to its number (1-12). 'May' is its own abbreviation.
MONTH_LIT = {
    spelling: number
    for number, full_name in enumerate(_MONTH_NAMES, start=1)
    for spelling in (full_name[:3], full_name)
}
## code ##
class DatefmtError(errors.ParserError):
    """Common base class of all errors raised by the date/time parsing code."""


class DateParserError(DatefmtError):
    """Raised when the date part of an input cannot be interpreted."""


class TimeParserError(DatefmtError):
    """Raised when the time part of an input cannot be interpreted."""


class DateFormatError(DatefmtError):
    """Raised when a date/time format specification is empty or unusable."""
class DF(str):
    """Format descriptor of a user-supplied date/time.

    The string consists of unit indicator characters (see *_valid_chars*)
    that record which units the user specified, e.g. 'DMY' or 'hm'.
    """

    # unit indicator characters, ordered from most to least significant
    _valid_chars = "YMDhmsn"

    # symbolic names of the individual unit characters
    year = 'Y'
    month = 'M'
    day = 'D'
    hour = 'h'
    minute = 'm'
    second = 's'
    microsecond = 'n'

    def valid(self):
        """Return the number of distinct unit characters present (0 if none)."""
        return len(set(self._valid_chars).intersection(self))

    def lsb(self):
        """Return the least significant unit character that is present."""
        if not self.valid():
            raise DateFormatError(
                'An empty date format string has no least significant position.', self)
        # scan from the low end; the first hit is the smallest unit
        return next(unit for unit in reversed(self._valid_chars) if unit in self)

    def msb(self):
        """Return the most significant unit character that is present."""
        if not self.valid():
            raise DateFormatError(
                'An empty date format string has no most significant position.', self)
        # scan from the high end; the first hit is the largest unit
        return next(unit for unit in self._valid_chars if unit in self)

    def is_time(self):
        """Return True if the highest unit present is a time unit (h/m/s/n)."""
        if not self.valid():
            return False
        return self.msb() not in 'YMD'

    def is_date(self):
        """Return True if the lowest unit present is a date unit (Y/M/D)."""
        if not self.valid():
            return False
        return self.lsb() not in 'hmsn'
# Candidate interpretations of ambiguous dates in day-first order of
# preference. 'p2' lists two-part formats, 'p3' three-part formats; earlier
# entries win over later ones.
PRIORITIES_INT = {
    'p2': [DF(''.join(units)) for units in (
        (DF.day, DF.month),    # DM
        (DF.month, DF.year),   # MY
        (DF.year, DF.month),   # YM
        (DF.month, DF.day),    # MD
        (DF.day, DF.year),     # DY
        (DF.year, DF.day),     # YD
    )],
    'p3': [DF(''.join(units)) for units in (
        (DF.day, DF.month, DF.year),   # DMY
        (DF.year, DF.month, DF.day),   # YMD
        (DF.month, DF.day, DF.year),   # MDY
        (DF.year, DF.day, DF.month),   # YDM
        (DF.day, DF.year, DF.month),   # DYM
        (DF.month, DF.year, DF.day),   # MYD
    )],
}
# Candidate interpretations of ambiguous dates in month-first order of
# preference. Same layout as the day-first table: 'p2' lists two-part
# formats, 'p3' three-part formats; earlier entries win over later ones.
PRIORITIES_US = {
    'p2': [DF(''.join(units)) for units in (
        (DF.month, DF.day),    # MD
        (DF.year, DF.month),   # YM
        (DF.day, DF.month),    # DM
        (DF.month, DF.year),   # MY
        (DF.day, DF.year),     # DY
        (DF.year, DF.day),     # YD
    )],
    'p3': [DF(''.join(units)) for units in (
        (DF.month, DF.day, DF.year),   # MDY
        (DF.year, DF.day, DF.month),   # YDM
        (DF.day, DF.month, DF.year),   # DMY
        (DF.year, DF.month, DF.day),   # YMD
        (DF.day, DF.year, DF.month),   # DYM
        (DF.month, DF.year, DF.day),   # MYD
    )],
}
def guess_date(tokens, priorities=None):
    """Guess the date from a tokenized string in an unknown format.

    *tokens* is an iterable of string tokens (numbers, month names, 'of' and
    delimiters). *priorities* is a dict with the keys 'p2' and 'p3', each a
    list of formats in order of preference for two- and three-part dates;
    it defaults to PRIORITIES_INT.

    Returns a tuple (date, format) of a datetime.date and the DF format
    that was chosen. Raises DateParserError if no valid date can be derived
    from the tokens.

    The method uses the following clues to guess the meaning of each part:

    * 4-digits implies it's a year
    * 1-digit discards year (since it's more common to write 04 instead of
      4 as a shorthand to 2004)
    * Literal month
    * 'of' is preceded by the day and succeeded by the month
    * Any of (st, nd, rd, th) on a number makes it a day
    * Number > 12 can't be a month
    * Number > 31 can't be a day
    * Date inexistence (e.g. 29.02.2018)
    * precedence DMY > YMD > MDY > YDM
    * precedence DM > MY > YM > MD

    """
    priorities = PRIORITIES_INT if priorities is None else priorities

    # We need to figure out which token corresponds to which component
    # (D, M, Y). Since this is ambiguous, guesswork is needed. We do so
    # by eliminating impossible options.

    # initially, all three components are viable
    guesses = [Struct(tok=tok.strip(), fmt=DF.year + DF.month + DF.day) for tok in tokens]

    # check indicators for specific formats
    for idx, itm in enumerate(guesses):
        tok = itm.tok
        if len(tok) == 1 and tok in '.,;':
            # delimiter tokens can be ignored
            itm.fmt = ''
        elif tok == 'of':
            # an 'of' token indicates a 'day of month' structure; only touch
            # neighbours that exist so that a leading or trailing 'of'
            # neither wraps around to the last item nor indexes out of range
            if idx > 0:
                guesses[idx - 1].fmt = DF.day
            if idx + 1 < len(guesses):
                guesses[idx + 1].fmt = DF.month
            itm.fmt = ''
        elif tok[-2:] in ('st', 'nd', 'rd', 'th') and tok[:-2].isdigit():
            # ordinal suffix on a number indicates a day. The numeric stem is
            # required because month names such as 'August' end in 'st', too.
            itm.fmt = DF.day
            itm.tok = tok[:-2]
        elif len(tok) == 4 and tok.isdigit():
            # four digits must be a year
            itm.fmt = DF.year
        elif tok in MONTH_LIT:
            # spelled out month; replace it by its number
            itm.tok = str(MONTH_LIT[tok])
            itm.fmt = DF.month

    # remove jitter (of, delimiters)
    guesses = [itm for itm in guesses if len(itm.fmt) > 0]

    # eliminate impossible options
    for itm in guesses:
        tok = itm.tok
        if len(tok) == 1:
            # can't be a year
            itm.fmt = itm.fmt.replace(DF.year, '')
        if tok.isdigit():
            value = int(tok)
            if value > 12:
                # can't be a month
                itm.fmt = itm.fmt.replace(DF.month, '')
            if value > 31:
                # can't be a day
                itm.fmt = itm.fmt.replace(DF.day, '')

    def create_date(year, month, day):
        """Return a date for the given components or None if that is not possible."""
        # each item must still be allowed to take the role it is assigned
        if DF.year not in year.fmt or DF.month not in month.fmt or DF.day not in day.fmt:
            return None

        if len(str(year.tok)) == 2:
            # ten years into the future is still the current century, otherwise the previous one
            threshold = ddate.today().year + 10 - 2000
            century = '20' if int(year.tok) < threshold else '19'
            year = Struct(tok=century + str(year.tok), fmt=year.fmt)

        try:
            return ddate(year=int(year.tok), month=int(month.tok), day=int(day.tok))
        except ValueError:
            # non-numeric token or inexistent date
            return None

    # placeholders for unspecified tokens
    pyear = Struct(tok=ddate.today().year, fmt=DF.year)
    pday = Struct(tok=1, fmt=DF.day)
    pmon = Struct(tok=1, fmt=DF.month)

    if len(guesses) == 1: # one-part date (Y)
        date = create_date(guesses[0], pmon, pday)
        if date is None:
            raise DateParserError('Two-digit date format must contain the year')
        return date, DF(DF.year)

    elif len(guesses) == 2: # two-part date (DM, MY, YM, MD)
        fst, snd = guesses
        # check components
        if len(set(fst.fmt + snd.fmt)) < 2:
            raise DateParserError('Invalid two-digit date format')

        if len(fst.fmt) == 1 and len(snd.fmt) == 1: # fully determined
            parts = {
                DF.year: pyear,
                DF.month: pmon,
                DF.day: pday,
            }
            parts.update({
                fst.fmt: fst,
                snd.fmt: snd,
            })
            date = create_date(parts[DF.year], parts[DF.month], parts[DF.day])
            if date is None:
                # the roles are unambiguous but the values form no valid date;
                # fail here instead of handing None to the caller
                raise DateParserError('Invalid date')
            return date, DF(fst.fmt + snd.fmt)

        # walk through prioritized formats
        formats = {
            DF(DF.day + DF.month): create_date(pyear, snd, fst), # DM
            DF(DF.month + DF.year): create_date(snd, fst, pday), # MY
            DF(DF.year + DF.month): create_date(fst, snd, pday), # YM
            DF(DF.month + DF.day): create_date(pyear, fst, snd), # MD
            DF(DF.day + DF.year): create_date(snd, pmon, fst), # DY
            DF(DF.year + DF.day): create_date(fst, pmon, snd), # YD
        }

        for fmt in priorities['p2']:
            if formats.get(fmt, None) is not None:
                return formats[fmt], fmt

        raise DateParserError('Cannot guess roles of a two-digit date format')

    elif len(guesses) == 3: # three-part date (DMY, YMD, MDY, YDM)
        # eliminate options based on uniqueness of component assignment
        changed = True
        while changed:
            # resolved guesses: item has only one possible component option
            resolved = {itm.fmt for itm in guesses if len(itm.fmt) == 1}
            # single choice: component has only one possible position
            unique = {comp for comp, freq in
                      Counter(flatten([set(itm.fmt) for itm in guesses])).items()
                      if freq == 1}
            # assume no changes
            changed = False
            for itm in guesses:
                if unique & set(itm.fmt) and not set(itm.fmt).issubset(unique):
                    # itm is the only option for one component
                    itm.fmt = DF(''.join(unique & set(itm.fmt)))
                    changed = True
                elif resolved & set(itm.fmt) and not set(itm.fmt).issubset(resolved):
                    # itm contains options that are already taken by a different item
                    itm.fmt = itm.fmt.translate(str.maketrans('', '', ''.join(resolved)))
                    changed = True

        fst, snd, trd = guesses

        # check components
        if len(set(fst.fmt + snd.fmt + trd.fmt)) < 3:
            raise DateParserError('Invalid three-digit date format')

        if len(fst.fmt) == 1 and len(snd.fmt) == 1 and len(trd.fmt) == 1: # fully determined
            parts = {
                fst.fmt: fst,
                snd.fmt: snd,
                trd.fmt: trd,
            }
            date = create_date(parts[DF.year], parts[DF.month], parts[DF.day])
            if date is None:
                # the roles are unambiguous but the values form no valid date;
                # fail here instead of handing None to the caller
                raise DateParserError('Invalid date')
            return date, DF(fst.fmt + snd.fmt + trd.fmt)

        # walk through prioritized formats
        formats = {
            DF(DF.day + DF.month + DF.year): create_date(year=trd, month=snd, day=fst), # DMY
            DF(DF.year + DF.month + DF.day): create_date(year=fst, month=snd, day=trd), # YMD
            DF(DF.month + DF.day + DF.year): create_date(year=trd, month=fst, day=snd), # MDY
            DF(DF.year + DF.day + DF.month): create_date(year=fst, month=trd, day=snd), # YDM
            DF(DF.day + DF.year + DF.month): create_date(year=snd, month=trd, day=fst), # DYM
            DF(DF.month + DF.year + DF.day): create_date(year=snd, month=fst, day=trd), # MYD
        }

        for fmt in priorities['p3']:
            if formats.get(fmt, None) is not None:
                return formats[fmt], fmt

        raise DateParserError('Cannot guess the roles of a three-digit date format')

    raise DateParserError('Cannot parse the date format')
def guess_time(tokens):
    """Guess the time from a tokenized string in an unknown format.

    *tokens* is an iterable of string tokens (numbers, separators and
    optionally 'am' or 'pm'). Returns a tuple (time, format) of a
    datetime.time and the DF format that was chosen. Raises TimeParserError
    if there is no numeric token, a value is out of range, or there are
    more than four numeric tokens.

    * Always sorted from hi (hour) to low (sec)
    * 4 Terms -> hour, min, sec, fraction of a second
    * 3 Terms -> hour, min, sec
    * 2 Terms -> hour, min | min, sec
      * am or pm present -> hour, min
      * first term > 24 -> min, sec
      * otherwise -> hour, min
    * 1 Term -> hour

    """
    # remove separators
    tokens = [tok.lower() for tok in tokens if tok not in ('.', ',', ':')]
    # check if the am/pm format was used
    is_am = 'am' in tokens
    is_pm = 'pm' in tokens
    # remove non-numbers
    tokens = [tok for tok in tokens if tok.isdigit()]
    if not tokens:
        raise TimeParserError('No numeric time component found')

    # The last token of a four-part time is a decimal fraction of a second:
    # pad it to microseconds and drop digits beyond microsecond precision.
    ms = int(tokens[-1][:6].ljust(6, '0'))
    # convert to int
    tokens = [int(tok) for tok in tokens]

    def to_24h(hour):
        """Convert a 12-hour clock value to 24-hour according to am/pm."""
        if is_am:
            # 12am is midnight
            return 0 if hour == 12 else hour
        if is_pm:
            # 12pm is noon; every other pm hour is in the afternoon
            return hour if hour == 12 else hour + 12
        return hour

    # guess format
    try:
        if len(tokens) == 4: # H:M:S.fraction
            tokens[-1] = ms
            return dtime(*tokens), DF(DF.hour + DF.minute + DF.second + DF.microsecond)
        elif len(tokens) == 3: # H:M:S
            return dtime(*tokens), DF(DF.hour + DF.minute + DF.second)
        elif len(tokens) == 2: # H:M or M:S
            if is_am or is_pm: # am/pm notation was used
                return dtime(to_24h(tokens[0]), tokens[1]), DF(DF.hour + DF.minute)
            elif tokens[0] > 24: # min, sec
                return dtime(0, *tokens), DF(DF.minute + DF.second)
            else: # hour, min
                return dtime(*tokens), DF(DF.hour + DF.minute)
        elif len(tokens) == 1: # H
            return dtime(to_24h(tokens[0])), DF(DF.hour)

    except ValueError as err:
        # invalid value was supplied, e.g. hour=85
        raise TimeParserError('Invalid value', tokens) from err

    raise TimeParserError('Unknown time format', tokens)
def guess_datetime(exp):
    """Turn a parsed date and/or time expression into a datetime.

    *exp* is expected to be the pyparsing.ParseResults group produced by
    DATETIME.parseString(...); its 'date' and 'time' parts are interpreted
    by guess_date and guess_time. Guessing is necessary because dates like
    10.11.12 are ambiguous.

    Returns a tuple (datetime, format). If only a date is given the time is
    midnight. If only a time is given the date is the local date of the Unix
    epoch. Raises DateFormatError if *exp* holds neither part.
    """
    # For now I assumed unique separators (dot for date, colon for time, comma to separate the two)
    has_date = 'date' in exp
    has_time = 'time' in exp

    if not (has_date or has_time):
        raise DateFormatError('Neither a date nor a time was found.')

    if has_date and has_time:
        # both parts present
        day, day_fmt = guess_date(exp.date)
        clock, clock_fmt = guess_time(exp.time)
        return datetime.combine(day, clock), DF(day_fmt+clock_fmt)

    if has_date:
        # date only
        day, day_fmt = guess_date(exp.date)
        return datetime(day.year, day.month, day.day), day_fmt

    # time only
    clock, clock_fmt = guess_time(exp.time)
    return datetime.combine(ddate.fromtimestamp(0), clock), clock_fmt
def increment(date, fmt):
    """Advance *date* by one step of the smallest unit named in *fmt*.

    All units below the smallest one are reset to their first value, e.g.
    incrementing by a month yields the first day of the next month at 00:00.
    Raises DateFormatError if *fmt* is empty or contains no valid unit.
    """
    if fmt == '' or not fmt.valid():
        raise DateFormatError('Invalid date format string', fmt)

    unit = fmt.lsb()
    # Per unit: the relative step (plural keyword) plus the absolute resets
    # (singular keywords) of all lower units.
    steps = (
        # 5.11.2012, 06:24:18.25 -> 5.11.2012, 06:24:18.26
        (fmt.microsecond, dict(microseconds=1)),
        # 5.11.2012, 06:24:18 -> 5.11.2012, 06:24:19
        (fmt.second, dict(seconds=1, microsecond=0)),
        # 5.11.2012, 06:24 -> 5.11.2012, 06:25
        (fmt.minute, dict(minutes=1, second=0, microsecond=0)),
        # 5.11.2012, 06am -> 5.11.2012, 07:00
        (fmt.hour, dict(hours=1, minute=0, second=0, microsecond=0)),
        # 5.11.2012 -> 6.11.2012, 00:00
        (fmt.day, dict(days=1, hour=0, minute=0, second=0, microsecond=0)),
        # 11.2012 -> 1.12.2012
        (fmt.month, dict(months=1, day=1, hour=0, minute=0, second=0, microsecond=0)),
    )
    for candidate, delta in steps:
        if unit == candidate:
            return date + relativedelta(**delta)

    # remaining case is the year: 2012 -> 1.1.2013, 00:00
    return date + relativedelta(
        years=1, month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
class DateTimeParser():
    """Callable parser that turns a date/time string into a datetime.

    The pyparsing grammars are stored in DATE, TIME and DATETIME once
    build_parser() has run; calling the instance builds them on demand.
    """

    # grammar elements, set by build_parser()
    DATE = None
    TIME = None
    DATETIME = None

    def build_parser(self):
        """Build the DATE, TIME and DATETIME grammars and return *self*."""
        # DATE  := YMD | DMY | MDY | YDM
        # YMD   := YEAR SEP MON SEP DAY
        # DMY   := DAY SEP [of] MON SEP YEAR
        # MDY   := MON SEP DAY SEP YEAR
        # YDM   := YEAR SEP DAY [of] MON
        # DM    := DAY SEP [of] MON
        # YM    := YEAR SEP MON
        # MY    := MON SEP YEAR
        # MD    := MON SEP DAY
        # DAY   := [D]D | [D]D st | [D]D nd | [D]D rd | [D]D th
        # MON   := [M]M | [month]
        # YEAR  := [YY]YY
        # SEP   := . | , | [whitespace]
        # {D,M,Y} := [digit]

        # FIXME: Allow more patterns (e.g. 2012, 10; April, 5th; April, 2020)
        date_sep = Literal('.') # FIXME: Allow '. - :'
        # between day and month, the word 'of' may replace the separator
        day_sep = date_sep ^ 'of'
        year = Word(nums, exact=2) ^ Word(nums, exact=4)
        month = Word(nums, min=1, max=2) ^ oneOf(list(MONTH_LIT))
        day = Combine(Word(nums, min=1, max=2) + Optional(oneOf('st nd rd th')))

        # three-part dates
        ymd = year + date_sep + month + date_sep + day
        dmy = day + day_sep + month + date_sep + year
        mdy = month + date_sep + day + date_sep + year
        ydm = year + date_sep + day + day_sep + month
        # two-part dates
        dm = day + day_sep + month
        ym = year + date_sep + month
        my = month + date_sep + year
        md = month + date_sep + day
        # year only
        y = Word(nums, exact=4)

        # date parser; alternatives are tried in the listed order
        self.DATE = Group(
            ymd | dmy | ydm | mdy | dm | ym | my | md | y
        ).setResultsName('date')

        # TIME  := HOUR SEP MIN [SEP SEC [. MS]] | HOUR SEP MIN | HOUR [SEP MIN] {am|pm}
        # HOUR  := [H]H
        # MIN   := [M]M
        # SEC   := [S]S
        # {H,M,S} := [digit]
        # SEP   := : | . | ,
        time_sep = Literal(':') # FIXME: Allow '. : -'
        unit = Word(nums, min=1, max=2)
        fraction = Word(nums, min=1)

        with_fraction = unit + time_sep + unit + time_sep + unit + oneOf('. :') + fraction
        with_seconds = unit + time_sep + unit + time_sep + unit
        twelve_hour = unit + Optional(time_sep + unit) + oneOf('am pm')
        hour_minute = unit + time_sep + unit

        # time parser; alternatives are tried in the listed order
        self.TIME = Group(
            with_fraction | with_seconds | twelve_hour | hour_minute
        ).setResultsName('time')

        # DATETIME := DATE | TIME | DATE SEP TIME | TIME SEP DATE
        # SEP      := , [whitespace]
        date_then_time = self.DATE + Optional(',') + self.TIME
        time_then_date = self.TIME + Optional(',') + self.DATE
        self.DATETIME = Group(
            self.DATE ^ self.TIME ^ date_then_time ^ time_then_date
        ).setResultsName('datetime')

        return self

    def __call__(self, datestr):
        """Parse *datestr* and return the guessed datetime.

        Raises errors.ParserError if the string does not match the grammar.
        """
        if self.DATETIME is None:
            # build the grammar on first use
            self.build_parser()

        try:
            parsed = self.DATETIME.parseString(datestr, parseAll=True)
            # the guessed format is not needed here
            moment, _ = guess_datetime(parsed[0])
            return moment
        except ParseException as e:
            raise errors.ParserError('Cannot parse query', e)
"""Default DateTimeParser instance.
To produce a datetime, call
>>> parse_datetime(datestring)
Convenience shortcut for
>>> DateTimeParser()(datestring)
"""
# Shared parser instance with its grammar already built (build_parser
# returns the instance itself); call it with a date/time string.
parse_datetime = DateTimeParser().build_parser()
## EOF ##
|