From 73becdc0c94744cdf4c20377cbca808c70d62565 Mon Sep 17 00:00:00 2001 From: NateScarlet Date: Sat, 9 Mar 2019 19:35:24 +0800 Subject: [PATCH] Fix missing days in 2007, 2011 --- 2007.json | 10 ++ 2011.json | 5 + description_parsing_cases.json | 56 +++++++++++ fetch_holidays.py | 165 +++++++++++++++++---------------- test_fetch_holidays.py | 4 +- 5 files changed, 158 insertions(+), 82 deletions(-) diff --git a/2007.json b/2007.json index 3add6d1..f96b374 100644 --- a/2007.json +++ b/2007.json @@ -29,6 +29,11 @@ "date": "2007-01-03", "isOffDay": true }, + { + "name": "春节", + "date": "2007-02-17", + "isOffDay": false + }, { "name": "春节", "date": "2007-02-18", @@ -64,6 +69,11 @@ "date": "2007-02-24", "isOffDay": true }, + { + "name": "春节", + "date": "2007-02-25", + "isOffDay": false + }, { "name": "“五一”", "date": "2007-04-28", diff --git a/2011.json b/2011.json index 44b495b..4f84061 100644 --- a/2011.json +++ b/2011.json @@ -29,6 +29,11 @@ "date": "2011-02-02", "isOffDay": true }, + { + "name": "春节", + "date": "2011-02-08", + "isOffDay": true + }, { "name": "春节", "date": "2011-02-12", diff --git a/description_parsing_cases.json b/description_parsing_cases.json index 16bee30..d21881c 100644 --- a/description_parsing_cases.json +++ b/description_parsing_cases.json @@ -422,5 +422,61 @@ { "date": "2008-09-27", "isOffDay": false }, { "date": "2008-09-28", "isOffDay": false } ] + }, + { + "year": 2007, + "description": "1月1日—3日放假,共3天。其中1月1日为法定假日,将2006年12月30日(星期六)、31日(星期日)两个公休日分别调至2007年1月2日、3日,2006年12月30日(星期六)、31日(星期日)上班。", + "expected": [ + { "date": "2007-01-01", "isOffDay": true }, + { "date": "2007-01-02", "isOffDay": true }, + { "date": "2007-01-03", "isOffDay": true }, + { "date": "2006-12-30", "isOffDay": false }, + { "date": "2006-12-31", "isOffDay": false } + ] + }, + { + "year": 2007, + "description": "2月18日—24日(即农历大年初一至初七)放假,共7天。其中18日、19日、20日为法定假日,将17日(星期六)、18日(星期日)、25日(星期日)三个公休日分别调至21日(星期三)、22日(星期四)、23日(星期五);24日(星期六)照常公休,17日、25日上班。", + "expected": [ + { "date": 
"2007-02-18", "isOffDay": true }, + { "date": "2007-02-19", "isOffDay": true }, + { "date": "2007-02-20", "isOffDay": true }, + { "date": "2007-02-21", "isOffDay": true }, + { "date": "2007-02-22", "isOffDay": true }, + { "date": "2007-02-23", "isOffDay": true }, + { "date": "2007-02-24", "isOffDay": true }, + { "date": "2007-02-17", "isOffDay": false }, + { "date": "2007-02-25", "isOffDay": false } + ] + }, + { + "year": 2007, + "description": "5月1日—7日放假,共7天。其中,1日、2日、3日为法定假日,将4月28日(星期六)、29日(星期日)两个公休日调至5月4日(星期五)、7日(星期一);5月5日(星期六)、6日(星期日)照常公休,4月28日、29日上班。", + "expected": [ + { "date": "2007-05-01", "isOffDay": true }, + { "date": "2007-05-02", "isOffDay": true }, + { "date": "2007-05-03", "isOffDay": true }, + { "date": "2007-05-04", "isOffDay": true }, + { "date": "2007-05-05", "isOffDay": true }, + { "date": "2007-05-06", "isOffDay": true }, + { "date": "2007-05-07", "isOffDay": true }, + { "date": "2007-04-28", "isOffDay": false }, + { "date": "2007-04-29", "isOffDay": false } + ] + }, + { + "year": 2007, + "description": "10月1日—7日放假,共7天。其中,1日、2日、3日为法定假日,将9月29日(星期六)、30日(星期日)两个公休日调至10月4日(星期四)、5日(星期五);10月6日(星期六)、7日(星期日)照常公休,9月29日、30日上班。", + "expected": [ + { "date": "2007-10-01", "isOffDay": true }, + { "date": "2007-10-02", "isOffDay": true }, + { "date": "2007-10-03", "isOffDay": true }, + { "date": "2007-10-04", "isOffDay": true }, + { "date": "2007-10-05", "isOffDay": true }, + { "date": "2007-10-06", "isOffDay": true }, + { "date": "2007-10-07", "isOffDay": true }, + { "date": "2007-09-29", "isOffDay": false }, + { "date": "2007-09-30", "isOffDay": false } + ] } ] diff --git a/fetch_holidays.py b/fetch_holidays.py index 7e9095f..24e6542 100755 --- a/fetch_holidays.py +++ b/fetch_holidays.py @@ -86,13 +86,70 @@ def _cast_int(value): return int(value) if value else None +class DescriptionParser: + """Parser for holiday shift description. 
""" + + def __init__(self, description: str, year: int): + self.description = description + self.year = year + self.date_history = list() + + def memorize_date(self, value: date): + self.date_history.append(value) + + def clear_memory(self): + del self.date_history[:] + + def parse(self) -> Iterator[dict]: + """Generator for description parsing result. + + Args: + year (int): Context year + """ + + self.clear_memory() + for i in re.split('[,。;]', self.description): + for j in SentenceParser(self, i).parse(): + yield j + + if not self.date_history: + raise NotImplementedError(self.description) + + def get_date(self, year: Optional[int], month: Optional[int], day: int) -> date: + """Get date in context. + + Args: + year (Optional[int]): year + month (int): month + day (int): day + + Returns: + date: Date result + """ + + assert day, 'No day specified' + + # Special case: month inherit + if month is None: + month = self.date_history[-1].month + + # Special case: 12 month may mean previous year + if (year is None + and month == 12 + and self.date_history + and max(self.date_history) < date(year=self.year, month=2, day=1)): + year = self.year - 1 + + year = year or self.year + return date(year=year, month=month, day=day) + + class SentenceParser: """Parser for holiday shift description sentence. """ - def __init__(self, sentence, year): + def __init__(self, parent: DescriptionParser, sentence): + self.parent = parent self.sentence = sentence - self.year = year - self._date_memory = set() def extract_dates(self, text: str) -> Iterator[date]: """Extract date from text. 
@@ -105,77 +162,49 @@ class SentenceParser: """ count = 0 + text = text.replace('(', '(').replace(')', ')') for method in self.date_extraction_methods: for i in method(self, text): count += 1 - if i in self._date_memory: + is_seen = i in self.parent.date_history + self.parent.memorize_date(i) + if is_seen: continue - self._date_memory.add(i) yield i if not count: raise NotImplementedError(text) - def get_date(self, year: Optional[int], month: int, day: int) -> date: - """Get date in context. - - Args: - year (Optional[int]): year - month (int): month - day (int): day - - Returns: - date: Date result - """ - - # Special case: 12 month may mean previous year - if (year is None - and month == 12 - and self._date_memory - and max(self._date_memory) < date(year=self.year, month=2, day=1)): - year = self.year - 1 - - year = year or self.year - return date(year=year, month=month, day=day) - def _extract_dates_1(self, value): - match = re.match(r'(?:(\d+)年)?(?:(\d+)月)(\d+)日', value) - if match: - groups = [_cast_int(i) for i in match.groups()] + match = re.findall(r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) + for groups in match: + groups = [_cast_int(i) for i in groups] assert len(groups) == 3, groups - yield self.get_date(year=groups[0], month=groups[1], day=groups[2]) + yield self.parent.get_date(year=groups[0], month=groups[1], day=groups[2]) def _extract_dates_2(self, value): - match = re.match( - r'(?:(\d+)年)?(?:(\d+)月)(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) - if match: - groups = [_cast_int(i) for i in match.groups()] + match = re.findall( + r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) + for groups in match: + groups = [_cast_int(i) for i in groups] assert len(groups) == 6, groups - start = self.get_date(year=groups[0], - month=groups[1], day=groups[2]) - end = self.get_date(year=groups[3], - month=groups[4] or groups[1], day=groups[5]) + start = self.parent.get_date(year=groups[0], + month=groups[1], day=groups[2]) + end = 
self.parent.get_date(year=groups[3], + month=groups[4], day=groups[5]) for i in range((end - start).days + 1): yield start + timedelta(days=i) def _extract_dates_3(self, value): - match = re.match( - r'(?:(\d+)年)?(?:(\d+)月)(\d+)日(?:([^)]+))?' + match = re.findall( + r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?' r'(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+', - value.replace('(', '(').replace(')', ')')) - if match: - groups = [_cast_int(i) for i in match.groups()] + value) + for groups in match: + groups = [_cast_int(i) for i in groups] assert not (len(groups) % 3), groups - year = self.year - month = None - day = None for i in range(0, len(groups), 3): - year = groups[i] - month = groups[i+1] or month - day = groups[i+2] - assert month - assert day - yield self.get_date(year=year, month=month, day=day) + yield self.parent.get_date(year=groups[i], month=groups[i+1], day=groups[i+2]) date_extraction_methods = [ _extract_dates_1, @@ -183,7 +212,7 @@ class SentenceParser: _extract_dates_3 ] - def parse(self, memory: set) -> Iterator[dict]: + def parse(self) -> Iterator[dict]: """Parse days with memory Args: @@ -193,7 +222,6 @@ class SentenceParser: Iterator[dict]: Days without name field. """ - self._date_memory = memory for method in self.parsing_methods: for i in method(self): yield i @@ -217,7 +245,7 @@ class SentenceParser: } def _parse_shift_1(self): - match = re.match('(.+)公休日调至(.+)', self.sentence) + match = re.match('(.+)调至(.+)', self.sentence) if match: for i in self.extract_dates(match.group(1)): yield { @@ -237,29 +265,6 @@ class SentenceParser: ] -class DescriptionParser: - """Parser for holiday shift description. """ - - def __init__(self, description): - self.description = description - self._date_memory = set() - - def parse(self, year: int) -> Iterator[dict]: - """Generator for description parsing result. 
- - Args: - year (int): Context year - """ - - self._date_memory.clear() - for i in re.split(',|。', self.description): - for j in SentenceParser(i, year).parse(self._date_memory): - yield j - - if not self._date_memory: - raise NotImplementedError(self.description) - - def fetch_holiday(year: int): """Fetch holiday data. """ @@ -273,7 +278,7 @@ def fetch_holiday(year: int): days.extend({ 'name': name, **j - } for j in DescriptionParser(description).parse(year)) + } for j in DescriptionParser(description, year).parse()) return { 'year': year, 'papers': papers, diff --git a/test_fetch_holidays.py b/test_fetch_holidays.py index b261971..84c6ece 100644 --- a/test_fetch_holidays.py +++ b/test_fetch_holidays.py @@ -17,8 +17,8 @@ def _generate_tests(): def create_test(case): def _test(): year, description, expected = case['year'], case['description'], case['expected'] - assert _normalize(DescriptionParser(description) - .parse(year)) == _normalize(expected), case + assert _normalize(DescriptionParser( + description, year).parse()) == _normalize(expected), case return _test for index, case in enumerate(cases, 1):