diff --git a/2016.json b/2016.json
index ba229ab..0073de8 100644
Binary files a/2016.json and b/2016.json differ
diff --git a/fetch_holidays.py b/fetch_holidays.py
index 5739c9b..7e9095f 100755
--- a/fetch_holidays.py
+++ b/fetch_holidays.py
@@ -5,7 +5,7 @@ import argparse
import json
import re
from datetime import date, timedelta
-from typing import List, Optional
+from typing import Iterator, List, Optional, Tuple
import bs4
import requests
@@ -30,26 +30,46 @@ def get_paper_urls(year: int) -> List[str]:
body = requests.get(url).text
ret = re.findall(
        r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)
- assert all(
- re.match(
- r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i)
- for i in ret), 'Site changed, need human verify'
return ret
-def get_paper(url):
+def get_paper(url: str) -> str:
+ """Extract paper text from url.
+
+ Args:
+ url (str): Paper url.
+
+ Returns:
+ str: Extracted paper text.
+ """
+
+ assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm',
+ url), 'Site changed, need human verify'
+
response = requests.get(url)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, features='html.parser')
container = soup.find('td', class_='b12c')
assert container, f'Can not get paper container from url: {url}'
- ret = container.get_text().replace('\u3000', '\n')
+ ret = container.get_text().replace('\u3000\u3000', '\n')
assert ret, f'Can not get paper context from url: {url}'
return ret
-def get_rules(paper: str):
+def get_rules(paper: str) -> Iterator[Tuple[str, str]]:
+ """Extract rules from paper.
+
+ Args:
+ paper (str): Paper text
+
+ Raises:
+ NotImplementedError: When find no rules.
+
+ Returns:
+ Iterator[Tuple[str, str]]: (name, description)
+ """
+
lines: list = paper.splitlines()
count = 0
for i in sorted(set(lines), key=lines.index):
@@ -74,11 +94,27 @@ class SentenceParser:
self.year = year
self._date_memory = set()
- def extract_dates(self, value) -> List[date]:
+ def extract_dates(self, text: str) -> Iterator[date]:
+ """Extract date from text.
+
+ Args:
+ text (str): Text to extract
+
+ Returns:
+ Iterator[date]: Extracted dates.
+ """
+
+ count = 0
for method in self.date_extraction_methods:
- for i in method(self, value):
- if i not in self._date_memory:
- yield i
+ for i in method(self, text):
+ count += 1
+ if i in self._date_memory:
+ continue
+ self._date_memory.add(i)
+ yield i
+
+ if not count:
+ raise NotImplementedError(text)
def get_date(self, year: Optional[int], month: int, day: int) -> date:
"""Get date in context.
@@ -147,12 +183,19 @@ class SentenceParser:
_extract_dates_3
]
- def parse(self, memory):
+ def parse(self, memory: set) -> Iterator[dict]:
+ """Parse days with memory
+
+ Args:
+ memory (set): Date memory
+
+ Returns:
+ Iterator[dict]: Days without name field.
+ """
+
self._date_memory = memory
for method in self.parsing_methods:
for i in method(self):
- if i['date'] in self._date_memory:
- continue
yield i
def _parse_rest_1(self):
@@ -201,7 +244,7 @@ class DescriptionParser:
self.description = description
self._date_memory = set()
- def parse(self, year: int):
+ def parse(self, year: int) -> Iterator[dict]:
"""Generator for description parsing result.
Args:
@@ -211,7 +254,6 @@ class DescriptionParser:
self._date_memory.clear()
for i in re.split(',|。', self.description):
for j in SentenceParser(i, year).parse(self._date_memory):
- self._date_memory.add(j['date'])
yield j
if not self._date_memory:
diff --git a/requirements.txt b/requirements.txt
index 792285a..4f0073b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
requests ~= 2.21.0
-beautifulsoup4 ~= 4.7.1
\ No newline at end of file
+beautifulsoup4 ~= 4.7.1
+tqdm ~= 4.30.0
diff --git a/test_fetch_holidays.py b/test_fetch_holidays.py
index f8a1e57..b261971 100644
--- a/test_fetch_holidays.py
+++ b/test_fetch_holidays.py
@@ -1,4 +1,4 @@
-
+"""Test module `fetch_holidays`. """
import json
import sys
@@ -6,7 +6,8 @@ from fetch_holidays import CustomJSONEncoder, DescriptionParser
def _normalize(iterable):
- return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date'])
+ return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)),
+ key=lambda x: x['date'])
def _generate_tests():
diff --git a/update.py b/update.py
index 1106ddd..d9173ce 100755
--- a/update.py
+++ b/update.py
@@ -1,9 +1,14 @@
#!/usr/bin/env python3
+"""Script for updating data. """
+
+import argparse
import json
import os
import subprocess
from datetime import datetime, timedelta, tzinfo
+from tqdm import tqdm
+
from fetch_holidays import CustomJSONEncoder, fetch_holiday
@@ -28,7 +33,16 @@ def _file_path(*other):
return os.path.join(__dirname__, *other)
-def update_data(year):
+def update_data(year: int) -> str:
+ """Update and store data for a year.
+
+ Args:
+ year (int): Year
+
+ Returns:
+ str: Stored data path
+ """
+
filename = _file_path(f'{year}.json')
with open(filename, 'w', encoding='utf-8', newline='\n') as f:
json.dump(fetch_holiday(year), f,
@@ -39,11 +53,18 @@ def update_data(year):
def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--all', action='store_true')
+ args = parser.parse_args()
+
now = datetime.now(ChinaTimezone())
filenames = []
- filenames.append(update_data(now.year))
- filenames.append(update_data(now.year + 1))
+ progress = tqdm(range(2014 if args.all else now.year, now.year + 2))
+ for i in progress:
+ progress.set_description(f'Updating {i} data')
+ filename = update_data(i)
+ filenames.append(filename)
subprocess.run(['git', 'add', *filenames], check=True)
diff = subprocess.run(['git', 'diff', '--stat', '--cached'],