diff --git a/fetch_holidays.py b/fetch_holidays.py index 289a42c..da8bcb9 100755 --- a/fetch_holidays.py +++ b/fetch_holidays.py @@ -11,10 +11,10 @@ from typing import Iterator, List, Optional, Tuple import bs4 import requests -SEARCH_URL = 'http://sousuo.gov.cn/s.htm' +SEARCH_URL = "http://sousuo.gov.cn/s.htm" PAPER_EXCLUDE = [ - 'http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm', - 'http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm', + "http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm", + "http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm", ] @@ -28,16 +28,20 @@ def get_paper_urls(year: int) -> List[str]: List[str]: Urls, newlest first. """ - body = requests.get(SEARCH_URL, params={ - 't': 'paper', - 'advance': 'true', - 'title': year, - 'q': '假期', - 'pcodeJiguan': '国办发明电', - 'puborg': '国务院办公厅' - }).text + body = requests.get( + SEARCH_URL, + params={ + "t": "paper", + "advance": "true", + "title": year, + "q": "假期", + "pcodeJiguan": "国办发明电", + "puborg": "国务院办公厅", + }, + ).text ret = re.findall( - r'
  • ', body, flags=re.S) + r'
  • ', body, flags=re.S + ) ret = [i for i in ret if i not in PAPER_EXCLUDE] ret.sort() return ret @@ -53,16 +57,17 @@ def get_paper(url: str) -> str: str: Extracted paper text. """ - assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', - url), 'Site changed, need human verify' + assert re.match( + r"http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm", url + ), "Site changed, need human verify" response = requests.get(url) - response.encoding = 'utf-8' - soup = bs4.BeautifulSoup(response.text, features='html.parser') - container = soup.find('td', class_='b12c') - assert container, f'Can not get paper container from url: {url}' - ret = container.get_text().replace('\u3000\u3000', '\n') - assert ret, f'Can not get paper context from url: {url}' + response.encoding = "utf-8" + soup = bs4.BeautifulSoup(response.text, features="html.parser") + container = soup.find("td", class_="b12c") + assert container, f"Can not get paper container from url: {url}" + ret = container.get_text().replace("\u3000\u3000", "\n") + assert ret, f"Can not get paper context from url: {url}" return ret @@ -99,7 +104,7 @@ def get_normal_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]: Iterator[Tuple[str, str]]: (name, description) """ for i in lines: - match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i) + match = re.match(r"[一二三四五六七八九十]、(.+?):(.+)", i) if match: yield match.groups() @@ -115,16 +120,16 @@ def get_patch_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]: """ name = None for i in lines: - match = re.match(r'.*\d+年([^和、]{2,})(?:假期|放假).*安排', i) + match = re.match(r".*\d+年([^和、]{2,})(?:假期|放假).*安排", i) if match: name = match.group(1) if not name: continue - match = re.match(r'^[一二三四五六七八九十]、(.+)$', i) + match = re.match(r"^[一二三四五六七八九十]、(.+)$", i) if not match: continue description = match.group(1) - if re.match(r'.*\d+月\d+日.*', description): + if re.match(r".*\d+月\d+日.*", description): yield name, description @@ -133,7 +138,7 @@ def _cast_int(value): class DescriptionParser: - """Parser for holiday shift description. """ + """Parser for holiday shift description.""" def __init__(self, description: str, year: int): self.description = description @@ -148,7 +153,7 @@ class DescriptionParser: """ del self.date_history[:] - for i in re.split('[,。;]', self.description): + for i in re.split("[,。;]", self.description): for j in SentenceParser(self, i).parse(): yield j @@ -167,17 +172,19 @@ class DescriptionParser: date: Date result """ - assert day, 'No day specified' + assert day, "No day specified" # Special case: month inherit if month is None: month = self.date_history[-1].month # Special case: 12 month may mean previous year - if (year is None - and month == 12 - and self.date_history - and max(self.date_history) < date(year=self.year, month=2, day=1)): + if ( + year is None + and month == 12 + and self.date_history + and max(self.date_history) < date(year=self.year, month=2, day=1) + ): year = self.year - 1 year = year or self.year @@ -185,10 +192,10 @@ class DescriptionParser: class SentenceParser: - """Parser for holiday shift description sentence. """ + """Parser for holiday shift description sentence.""" special_cases = { - '延长2020年春节假期至2月2日(农历正月初九': [ + "延长2020年春节假期至2月2日(农历正月初九": [ {"date": date(2020, 1, 31), "isOffDay": True}, {"date": date(2020, 2, 1), "isOffDay": True}, {"date": date(2020, 2, 2), "isOffDay": True}, @@ -210,8 +217,10 @@ class SentenceParser: """ count = 0 - text = text.replace('(', '(').replace(')', ')') - for i in chain(*(method(self, text) for method in self.date_extraction_methods)): + text = text.replace("(", "(").replace(")", ")") + for i in chain( + *(method(self, text) for method in self.date_extraction_methods) + ): count += 1 is_seen = i in self.parent.date_history self.parent.date_history.append(i) @@ -223,7 +232,7 @@ class SentenceParser: raise NotImplementedError(text) def _extract_dates_1(self, value: str) -> Iterator[date]: - match = re.findall(r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) + match = re.findall(r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日", value) for groups in match: groups = [_cast_int(i) for i in groups] assert len(groups) == 3, groups @@ -231,33 +240,31 @@ class SentenceParser: def _extract_dates_2(self, value: str) -> Iterator[date]: match = re.findall( - r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value) + r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日", value + ) for groups in match: groups = [_cast_int(i) for i in groups] assert len(groups) == 6, groups - start = self.parent.get_date(year=groups[0], - month=groups[1], day=groups[2]) - end = self.parent.get_date(year=groups[3], - month=groups[4], day=groups[5]) + start = self.parent.get_date(year=groups[0], month=groups[1], day=groups[2]) + end = self.parent.get_date(year=groups[3], month=groups[4], day=groups[5]) for i in range((end - start).days + 1): yield start + timedelta(days=i) def _extract_dates_3(self, value: str) -> Iterator[date]: match = re.findall( - r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?' - r'(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+', - value) + r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?" + r"(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+", + value, + ) for groups in match: groups = [_cast_int(i) for i in groups] assert not (len(groups) % 3), groups for i in range(0, len(groups), 3): - yield self.parent.get_date(year=groups[i], month=groups[i+1], day=groups[i+2]) + yield self.parent.get_date( + year=groups[i], month=groups[i + 1], day=groups[i + 2] + ) - date_extraction_methods = [ - _extract_dates_1, - _extract_dates_2, - _extract_dates_3 - ] + date_extraction_methods = [_extract_dates_1, _extract_dates_2, _extract_dates_3] def parse(self) -> Iterator[dict]: """Parse days with memory @@ -273,36 +280,24 @@ class SentenceParser: yield i def _parse_rest_1(self): - match = re.match(r'(.+)(放假|补休|调休|公休)+(?:\d+天)?$', self.sentence) + match = re.match(r"(.+)(放假|补休|调休|公休)+(?:\d+天)?$", self.sentence) if match: for i in self.extract_dates(match.group(1)): - yield { - 'date': i, - 'isOffDay': True - } + yield {"date": i, "isOffDay": True} def _parse_work_1(self): - match = re.match('(.+)上班$', self.sentence) + match = re.match("(.+)上班$", self.sentence) if match: for i in self.extract_dates(match.group(1)): - yield { - 'date': i, - 'isOffDay': False - } + yield {"date": i, "isOffDay": False} def _parse_shift_1(self): - match = re.match('(.+)调至(.+)', self.sentence) + match = re.match("(.+)调至(.+)", self.sentence) if match: for i in self.extract_dates(match.group(1)): - yield { - 'date': i, - 'isOffDay': False - } + yield {"date": i, "isOffDay": False} for i in self.extract_dates(match.group(2)): - yield { - 'date': i, - 'isOffDay': True - } + yield {"date": i, "isOffDay": True} def _parse_special(self): for i in self.special_cases.get(self.sentence, []): @@ -328,49 +323,50 @@ def parse_paper(year: int, url: str) -> Iterator[dict]: """ paper = get_paper(url) rules = get_rules(paper) - ret = ({'name': name, **i} - for name, description in rules - for i in DescriptionParser(description, year).parse()) + ret = ( + {"name": name, **i} + for name, description in rules + for i in DescriptionParser(description, year).parse() + ) try: for i in ret: yield i except NotImplementedError as ex: - raise RuntimeError('Can not parse paper', url) from ex + raise RuntimeError("Can not parse paper", url) from ex def fetch_holiday(year: int): - """Fetch holiday data. """ + """Fetch holiday data.""" papers = get_paper_urls(year) days = dict() - for k in (j - for i in papers - for j in parse_paper(year, i)): - days[k['date']] = k + for k in (j for i in papers for j in parse_paper(year, i)): + days[k["date"]] = k return { - 'year': year, - 'papers': papers, - 'days': sorted(days.values(), key=lambda x: x['date']) + "year": year, + "papers": papers, + "days": sorted(days.values(), key=lambda x: x["date"]), } def main(): parser = argparse.ArgumentParser() - parser.add_argument('year', type=int) + parser.add_argument("year", type=int) args = parser.parse_args() year = args.year - print(json.dumps(fetch_holiday(year), - indent=4, - ensure_ascii=False, - cls=CustomJSONEncoder)) + print( + json.dumps( + fetch_holiday(year), indent=4, ensure_ascii=False, cls=CustomJSONEncoder + ) + ) class CustomJSONEncoder(json.JSONEncoder): - """Custom json encoder. """ + """Custom json encoder.""" def default(self, o): # pylint:disable=method-hidden @@ -380,5 +376,5 @@ class CustomJSONEncoder(json.JSONEncoder): return super().default(o) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tests/filetools.py b/tests/filetools.py index 31de6d6..1d1437b 100644 --- a/tests/filetools.py +++ b/tests/filetools.py @@ -1,6 +1,7 @@ """Tools for files. """ import os + __dirname__ = os.path.abspath(os.path.dirname(__file__)) diff --git a/tests/test_fetch_holidays.py b/tests/test_fetch_holidays.py index c66e130..3161b1f 100644 --- a/tests/test_fetch_holidays.py +++ b/tests/test_fetch_holidays.py @@ -3,40 +3,54 @@ import json import pytest -from fetch_holidays import (CustomJSONEncoder, DescriptionParser, get_paper, - get_paper_urls, get_rules) +from fetch_holidays import ( + CustomJSONEncoder, + DescriptionParser, + get_paper, + get_paper_urls, + get_rules, +) from .filetools import _file_path def test_get_paper_urls(): assert get_paper_urls(2019) == [ - 'http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm', - 'http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm', + "http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm", + "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm", ] def test_get_rules(): - assert ( - list(get_rules(get_paper( - 'http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm'))) - == [('劳动节', - '2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。')]) + assert list( + get_rules( + get_paper( + "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm" + ) + ) + ) == [("劳动节", "2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。")] def _normalize(iterable): - return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), - key=lambda x: x['date']) + return sorted( + json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), + key=lambda x: x["date"], + ) def _description_parsing_cases(): - with open(_file_path('description_parsing_cases.json'), 'r', encoding='utf-8', ) as f: + with open( + _file_path("description_parsing_cases.json"), + "r", + encoding="utf-8", + ) as f: return json.load(f) -@pytest.mark.parametrize('case', _description_parsing_cases()) +@pytest.mark.parametrize("case", _description_parsing_cases()) def test_parse_description(case): - year, description, expected = case['year'], case['description'], case['expected'] - assert _normalize(DescriptionParser( - description, year).parse()) == _normalize(expected), case + year, description, expected = case["year"], case["description"], case["expected"] + assert _normalize(DescriptionParser(description, year).parse()) == _normalize( + expected + ), case