diff --git a/fetch_holidays.py b/fetch_holidays.py
index 289a42c..da8bcb9 100755
--- a/fetch_holidays.py
+++ b/fetch_holidays.py
@@ -11,10 +11,10 @@ from typing import Iterator, List, Optional, Tuple
import bs4
import requests
-SEARCH_URL = 'http://sousuo.gov.cn/s.htm'
+SEARCH_URL = "http://sousuo.gov.cn/s.htm"
PAPER_EXCLUDE = [
- 'http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm',
- 'http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm',
+ "http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm",
+ "http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm",
]
@@ -28,16 +28,20 @@ def get_paper_urls(year: int) -> List[str]:
List[str]: Urls, newlest first.
"""
- body = requests.get(SEARCH_URL, params={
- 't': 'paper',
- 'advance': 'true',
- 'title': year,
- 'q': '假期',
- 'pcodeJiguan': '国办发明电',
- 'puborg': '国务院办公厅'
- }).text
+ body = requests.get(
+ SEARCH_URL,
+ params={
+ "t": "paper",
+ "advance": "true",
+ "title": year,
+ "q": "假期",
+ "pcodeJiguan": "国办发明电",
+ "puborg": "国务院办公厅",
+ },
+ ).text
ret = re.findall(
- r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)
+ r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S
+ )
ret = [i for i in ret if i not in PAPER_EXCLUDE]
ret.sort()
return ret
@@ -53,16 +57,17 @@ def get_paper(url: str) -> str:
str: Extracted paper text.
"""
- assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm',
- url), 'Site changed, need human verify'
+ assert re.match(
+ r"http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm", url
+ ), "Site changed, need human verify"
response = requests.get(url)
- response.encoding = 'utf-8'
- soup = bs4.BeautifulSoup(response.text, features='html.parser')
- container = soup.find('td', class_='b12c')
- assert container, f'Can not get paper container from url: {url}'
- ret = container.get_text().replace('\u3000\u3000', '\n')
- assert ret, f'Can not get paper context from url: {url}'
+ response.encoding = "utf-8"
+ soup = bs4.BeautifulSoup(response.text, features="html.parser")
+ container = soup.find("td", class_="b12c")
+ assert container, f"Can not get paper container from url: {url}"
+ ret = container.get_text().replace("\u3000\u3000", "\n")
+ assert ret, f"Can not get paper context from url: {url}"
return ret
@@ -99,7 +104,7 @@ def get_normal_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]:
Iterator[Tuple[str, str]]: (name, description)
"""
for i in lines:
- match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i)
+ match = re.match(r"[一二三四五六七八九十]、(.+?):(.+)", i)
if match:
yield match.groups()
@@ -115,16 +120,16 @@ def get_patch_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]:
"""
name = None
for i in lines:
- match = re.match(r'.*\d+年([^和、]{2,})(?:假期|放假).*安排', i)
+ match = re.match(r".*\d+年([^和、]{2,})(?:假期|放假).*安排", i)
if match:
name = match.group(1)
if not name:
continue
- match = re.match(r'^[一二三四五六七八九十]、(.+)$', i)
+ match = re.match(r"^[一二三四五六七八九十]、(.+)$", i)
if not match:
continue
description = match.group(1)
- if re.match(r'.*\d+月\d+日.*', description):
+ if re.match(r".*\d+月\d+日.*", description):
yield name, description
@@ -133,7 +138,7 @@ def _cast_int(value):
class DescriptionParser:
- """Parser for holiday shift description. """
+ """Parser for holiday shift description."""
def __init__(self, description: str, year: int):
self.description = description
@@ -148,7 +153,7 @@ class DescriptionParser:
"""
del self.date_history[:]
- for i in re.split('[,。;]', self.description):
+ for i in re.split("[,。;]", self.description):
for j in SentenceParser(self, i).parse():
yield j
@@ -167,17 +172,19 @@ class DescriptionParser:
date: Date result
"""
- assert day, 'No day specified'
+ assert day, "No day specified"
# Special case: month inherit
if month is None:
month = self.date_history[-1].month
# Special case: 12 month may mean previous year
- if (year is None
- and month == 12
- and self.date_history
- and max(self.date_history) < date(year=self.year, month=2, day=1)):
+ if (
+ year is None
+ and month == 12
+ and self.date_history
+ and max(self.date_history) < date(year=self.year, month=2, day=1)
+ ):
year = self.year - 1
year = year or self.year
@@ -185,10 +192,10 @@ class DescriptionParser:
class SentenceParser:
- """Parser for holiday shift description sentence. """
+ """Parser for holiday shift description sentence."""
special_cases = {
- '延长2020年春节假期至2月2日(农历正月初九': [
+ "延长2020年春节假期至2月2日(农历正月初九": [
{"date": date(2020, 1, 31), "isOffDay": True},
{"date": date(2020, 2, 1), "isOffDay": True},
{"date": date(2020, 2, 2), "isOffDay": True},
@@ -210,8 +217,10 @@ class SentenceParser:
"""
count = 0
- text = text.replace('(', '(').replace(')', ')')
- for i in chain(*(method(self, text) for method in self.date_extraction_methods)):
+ text = text.replace("(", "(").replace(")", ")")
+ for i in chain(
+ *(method(self, text) for method in self.date_extraction_methods)
+ ):
count += 1
is_seen = i in self.parent.date_history
self.parent.date_history.append(i)
@@ -223,7 +232,7 @@ class SentenceParser:
raise NotImplementedError(text)
def _extract_dates_1(self, value: str) -> Iterator[date]:
- match = re.findall(r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value)
+ match = re.findall(r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日", value)
for groups in match:
groups = [_cast_int(i) for i in groups]
assert len(groups) == 3, groups
@@ -231,33 +240,31 @@ class SentenceParser:
def _extract_dates_2(self, value: str) -> Iterator[date]:
match = re.findall(
- r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日', value)
+ r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日", value
+ )
for groups in match:
groups = [_cast_int(i) for i in groups]
assert len(groups) == 6, groups
- start = self.parent.get_date(year=groups[0],
- month=groups[1], day=groups[2])
- end = self.parent.get_date(year=groups[3],
- month=groups[4], day=groups[5])
+ start = self.parent.get_date(year=groups[0], month=groups[1], day=groups[2])
+ end = self.parent.get_date(year=groups[3], month=groups[4], day=groups[5])
for i in range((end - start).days + 1):
yield start + timedelta(days=i)
def _extract_dates_3(self, value: str) -> Iterator[date]:
match = re.findall(
- r'(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?'
- r'(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+',
- value)
+ r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?"
+ r"(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+",
+ value,
+ )
for groups in match:
groups = [_cast_int(i) for i in groups]
assert not (len(groups) % 3), groups
for i in range(0, len(groups), 3):
- yield self.parent.get_date(year=groups[i], month=groups[i+1], day=groups[i+2])
+ yield self.parent.get_date(
+ year=groups[i], month=groups[i + 1], day=groups[i + 2]
+ )
- date_extraction_methods = [
- _extract_dates_1,
- _extract_dates_2,
- _extract_dates_3
- ]
+ date_extraction_methods = [_extract_dates_1, _extract_dates_2, _extract_dates_3]
def parse(self) -> Iterator[dict]:
"""Parse days with memory
@@ -273,36 +280,24 @@ class SentenceParser:
yield i
def _parse_rest_1(self):
- match = re.match(r'(.+)(放假|补休|调休|公休)+(?:\d+天)?$', self.sentence)
+ match = re.match(r"(.+)(放假|补休|调休|公休)+(?:\d+天)?$", self.sentence)
if match:
for i in self.extract_dates(match.group(1)):
- yield {
- 'date': i,
- 'isOffDay': True
- }
+ yield {"date": i, "isOffDay": True}
def _parse_work_1(self):
- match = re.match('(.+)上班$', self.sentence)
+ match = re.match("(.+)上班$", self.sentence)
if match:
for i in self.extract_dates(match.group(1)):
- yield {
- 'date': i,
- 'isOffDay': False
- }
+ yield {"date": i, "isOffDay": False}
def _parse_shift_1(self):
- match = re.match('(.+)调至(.+)', self.sentence)
+ match = re.match("(.+)调至(.+)", self.sentence)
if match:
for i in self.extract_dates(match.group(1)):
- yield {
- 'date': i,
- 'isOffDay': False
- }
+ yield {"date": i, "isOffDay": False}
for i in self.extract_dates(match.group(2)):
- yield {
- 'date': i,
- 'isOffDay': True
- }
+ yield {"date": i, "isOffDay": True}
def _parse_special(self):
for i in self.special_cases.get(self.sentence, []):
@@ -328,49 +323,50 @@ def parse_paper(year: int, url: str) -> Iterator[dict]:
"""
paper = get_paper(url)
rules = get_rules(paper)
- ret = ({'name': name, **i}
- for name, description in rules
- for i in DescriptionParser(description, year).parse())
+ ret = (
+ {"name": name, **i}
+ for name, description in rules
+ for i in DescriptionParser(description, year).parse()
+ )
try:
for i in ret:
yield i
except NotImplementedError as ex:
- raise RuntimeError('Can not parse paper', url) from ex
+ raise RuntimeError("Can not parse paper", url) from ex
def fetch_holiday(year: int):
- """Fetch holiday data. """
+ """Fetch holiday data."""
papers = get_paper_urls(year)
days = dict()
- for k in (j
- for i in papers
- for j in parse_paper(year, i)):
- days[k['date']] = k
+ for k in (j for i in papers for j in parse_paper(year, i)):
+ days[k["date"]] = k
return {
- 'year': year,
- 'papers': papers,
- 'days': sorted(days.values(), key=lambda x: x['date'])
+ "year": year,
+ "papers": papers,
+ "days": sorted(days.values(), key=lambda x: x["date"]),
}
def main():
parser = argparse.ArgumentParser()
- parser.add_argument('year', type=int)
+ parser.add_argument("year", type=int)
args = parser.parse_args()
year = args.year
- print(json.dumps(fetch_holiday(year),
- indent=4,
- ensure_ascii=False,
- cls=CustomJSONEncoder))
+ print(
+ json.dumps(
+ fetch_holiday(year), indent=4, ensure_ascii=False, cls=CustomJSONEncoder
+ )
+ )
class CustomJSONEncoder(json.JSONEncoder):
- """Custom json encoder. """
+ """Custom json encoder."""
def default(self, o):
# pylint:disable=method-hidden
@@ -380,5 +376,5 @@ class CustomJSONEncoder(json.JSONEncoder):
return super().default(o)
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
diff --git a/tests/filetools.py b/tests/filetools.py
index 31de6d6..1d1437b 100644
--- a/tests/filetools.py
+++ b/tests/filetools.py
@@ -1,6 +1,7 @@
"""Tools for files. """
import os
+
__dirname__ = os.path.abspath(os.path.dirname(__file__))
diff --git a/tests/test_fetch_holidays.py b/tests/test_fetch_holidays.py
index c66e130..3161b1f 100644
--- a/tests/test_fetch_holidays.py
+++ b/tests/test_fetch_holidays.py
@@ -3,40 +3,54 @@ import json
import pytest
-from fetch_holidays import (CustomJSONEncoder, DescriptionParser, get_paper,
- get_paper_urls, get_rules)
+from fetch_holidays import (
+ CustomJSONEncoder,
+ DescriptionParser,
+ get_paper,
+ get_paper_urls,
+ get_rules,
+)
from .filetools import _file_path
def test_get_paper_urls():
assert get_paper_urls(2019) == [
- 'http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm',
- 'http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm',
+ "http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm",
+ "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm",
]
def test_get_rules():
- assert (
- list(get_rules(get_paper(
- 'http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm')))
- == [('劳动节',
- '2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。')])
+ assert list(
+ get_rules(
+ get_paper(
+ "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm"
+ )
+ )
+ ) == [("劳动节", "2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。")]
def _normalize(iterable):
- return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)),
- key=lambda x: x['date'])
+ return sorted(
+ json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)),
+ key=lambda x: x["date"],
+ )
def _description_parsing_cases():
- with open(_file_path('description_parsing_cases.json'), 'r', encoding='utf-8', ) as f:
+ with open(
+ _file_path("description_parsing_cases.json"),
+ "r",
+ encoding="utf-8",
+ ) as f:
return json.load(f)
-@pytest.mark.parametrize('case', _description_parsing_cases())
+@pytest.mark.parametrize("case", _description_parsing_cases())
def test_parse_description(case):
- year, description, expected = case['year'], case['description'], case['expected']
- assert _normalize(DescriptionParser(
- description, year).parse()) == _normalize(expected), case
+ year, description, expected = case["year"], case["description"], case["expected"]
+ assert _normalize(DescriptionParser(description, year).parse()) == _normalize(
+ expected
+ ), case