From e6b04e9b25ebe3fbd934bc0241e2161d01c341c0 Mon Sep 17 00:00:00 2001 From: NateScarlet Date: Fri, 8 Mar 2019 00:11:53 +0800 Subject: [PATCH] Implement parsing (WIP) --- fetch_holidays.py | 160 ++++++++++++++++++++++++++++++++++++++--- test_fetch_holidays.py | 10 ++- 2 files changed, 158 insertions(+), 12 deletions(-) diff --git a/fetch_holidays.py b/fetch_holidays.py index 61434bf..f38696a 100644 --- a/fetch_holidays.py +++ b/fetch_holidays.py @@ -2,7 +2,10 @@ """Fetch holidays from gov.cn """ import argparse +import json import re +from datetime import date, timedelta +from typing import List import bs4 import requests @@ -13,7 +16,16 @@ SEARCH_URL = ('http://sousuo.gov.cn/s.htm' '&pcodeJiguan=%E5%9B%BD%E5%8A%9E%E5%8F%91%E6%98%8E%E7%94%B5') -def get_paper_urls(year): +def get_paper_urls(year: int) -> List[str]: + """Find year related paper urls. + + Args: + year (int): eg. 2018 + + Returns: + List[str]: Urls + """ + url = SEARCH_URL.format(year=year) body = requests.get(url).text ret = re.findall( @@ -45,24 +57,154 @@ def get_rules(paper: str): yield match.groups() -def parse_holiday_description(year, description): - pass +def _cast_int(value): + return int(value) if value else None -def parse_paper(url): - pass +class SentenceParser: + """Parser for rule sentence. """ + + def __init__(self, sentence, year): + self.sentence = sentence + self.year = year + + def extract_dates(self, value) -> List[date]: + memory = set() + for method in self.date_extraction_methods: + for i in method(self, value): + if i not in memory: + memory.add(i) + yield i + + def _extract_dates_1(self, value): + match = re.match(r'(?:(\d+)年)?(?:(\d+)月)(\d+)日', value) + if match: + groups = [_cast_int(i) for i in match.groups()] + assert len(groups) == 3, groups + yield date(year=groups[0] or self.year, + month=groups[1], day=groups[2]) + + def _extract_dates_2(self, value): + match = re.match( + r'(?:(\d+)年)?(?:(\d+)月)(\d+)日(?:至|-)(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) + if match: + groups = [_cast_int(i) for i in match.groups()] + assert len(groups) == 6, groups + start = date(year=groups[0] or self.year, + month=groups[1], day=groups[2]) + end = date(year=groups[3] or self.year, + month=groups[4] or groups[1], day=groups[5]) + for i in range((end - start).days + 1): + yield start + timedelta(days=i) + + def _extract_dates_3(self, value): + match = re.match( + r'(?:(\d+)年)?(?:(\d+)月)(\d+)日(?:([^)]+))?(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+', value) + if match: + groups = [_cast_int(i) for i in match.groups()] + assert not (len(groups) % 3), groups + year = self.year + month = None + day = None + for i in range(0, len(groups), 3): + year = groups[i] or year + month = groups[i+1] or month + day = groups[i+2] + assert year + assert month + assert day + yield date(year=year, month=month, day=day) + + date_extraction_methods = [ + _extract_dates_1, + _extract_dates_2, + _extract_dates_3 + ] + + def parse(self): + date_memory = set() + for method in self.parsing_methods: + + for i in method(self): + if i['date'] in date_memory: + continue + date_memory.add(i['date']) + yield i + + def _parse_rest_1(self): + match = re.match('(.+)放假(调休)?$', self.sentence) + if match: + for i in self.extract_dates(match.group(1)): + yield { + 'date': i, + 'isOffDay': True + } + + def _parse_work_1(self): + match = re.match('(.+)上班$', self.sentence) + if match: + for i in self.extract_dates(match.group(1)): + yield { + 'date': i, + 'isOffDay': False + } + + def _parse_work_2(self): + match = re.match('(.+)公休日调至(.+)', self.sentence) + if match: + for i in self.extract_dates(match.group(1)): + yield { + 'date': i, + 'isOffDay': False + } + for i in self.extract_dates(match.group(2)): + yield { + 'date': i, + 'isOffDay': True + } + + parsing_methods = [ + _parse_rest_1, + _parse_work_1, + _parse_work_2, + ] + + +def parse_holiday_description(description: str, year: int): + for i in re.split(',|。', description): + for j in SentenceParser(i, year).parse(): + yield j def main(): parser = argparse.ArgumentParser() - parser.add_argument('year') + parser.add_argument('year', type=int) args = parser.parse_args() + year = args.year + papers = get_paper_urls(year) - papers = get_paper_urls(args.year) - + ret = [] for i in papers: paper = get_paper(i) - [print(i) for i in get_rules(paper)] + rules = get_rules(paper) + for name, description in rules: + ret.extend({ + 'name': name, + **j + } for j in parse_holiday_description(description, year)) + + print(json.dumps(ret, indent=4, ensure_ascii=False, cls=CustomJSONEncoder)) + + +class CustomJSONEncoder(json.JSONEncoder): + """Custom json encoder. """ + + def default(self, o): + # pylint:disable=method-hidden + if isinstance(o, date): + return o.isoformat() + + return super().default(o) if __name__ == '__main__': diff --git a/test_fetch_holidays.py b/test_fetch_holidays.py index c2c20c4..60e3acb 100644 --- a/test_fetch_holidays.py +++ b/test_fetch_holidays.py @@ -2,7 +2,11 @@ import json import sys -from fetch_holidays import parse_holiday_description +from fetch_holidays import CustomJSONEncoder, parse_holiday_description + + +def _normalize(iterable): + return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date']) def _generate_tests(): @@ -12,8 +16,8 @@ def _generate_tests(): def create_test(case): def _test(): year, description, expected = case['year'], case['description'], case['expected'] - assert parse_holiday_description( - year, description) == expected, case + assert _normalize(parse_holiday_description( + description, year)) == _normalize(expected), case return _test for index, case in enumerate(cases, 1):