diff --git a/2016.json b/2016.json index ba229ab..0073de8 100644 Binary files a/2016.json and b/2016.json differ diff --git a/fetch_holidays.py b/fetch_holidays.py index 5739c9b..7e9095f 100755 --- a/fetch_holidays.py +++ b/fetch_holidays.py @@ -5,7 +5,7 @@ import argparse import json import re from datetime import date, timedelta -from typing import List, Optional +from typing import Iterator, List, Optional, Tuple import bs4 import requests @@ -30,26 +30,46 @@ def get_paper_urls(year: int) -> List[str]: body = requests.get(url).text ret = re.findall( r'
  • ', body, flags=re.S) - assert all( - re.match( - r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i) - for i in ret), 'Site changed, need human verify' return ret -def get_paper(url): +def get_paper(url: str) -> str: + """Extract paper text from url. + + Args: + url (str): Paper url. + + Returns: + str: Extracted paper text. + """ + + assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', + url), 'Site changed, need human verify' + response = requests.get(url) response.encoding = 'utf-8' soup = bs4.BeautifulSoup(response.text, features='html.parser') container = soup.find('td', class_='b12c') assert container, f'Can not get paper container from url: {url}' - ret = container.get_text().replace('\u3000', '\n') + ret = container.get_text().replace('\u3000\u3000', '\n') assert ret, f'Can not get paper context from url: {url}' return ret -def get_rules(paper: str): +def get_rules(paper: str) -> Iterator[Tuple[str, str]]: + """Extract rules from paper. + + Args: + paper (str): Paper text + + Raises: + NotImplementedError: When find no rules. + + Returns: + Iterator[Tuple[str, str]]: (name, description) + """ + lines: list = paper.splitlines() count = 0 for i in sorted(set(lines), key=lines.index): @@ -74,11 +94,27 @@ class SentenceParser: self.year = year self._date_memory = set() - def extract_dates(self, value) -> List[date]: + def extract_dates(self, text: str) -> Iterator[date]: + """Extract date from text. + + Args: + text (str): Text to extract + + Returns: + Iterator[date]: Extracted dates. + """ + + count = 0 for method in self.date_extraction_methods: - for i in method(self, value): - if i not in self._date_memory: - yield i + for i in method(self, text): + count += 1 + if i in self._date_memory: + continue + self._date_memory.add(i) + yield i + + if not count: + raise NotImplementedError(text) def get_date(self, year: Optional[int], month: int, day: int) -> date: """Get date in context. @@ -147,12 +183,19 @@ class SentenceParser: _extract_dates_3 ] - def parse(self, memory): + def parse(self, memory: set) -> Iterator[dict]: + """Parse days with memory + + Args: + memory (set): Date memory + + Returns: + Iterator[dict]: Days without name field. + """ + self._date_memory = memory for method in self.parsing_methods: for i in method(self): - if i['date'] in self._date_memory: - continue yield i def _parse_rest_1(self): @@ -201,7 +244,7 @@ class DescriptionParser: self.description = description self._date_memory = set() - def parse(self, year: int): + def parse(self, year: int) -> Iterator[dict]: """Generator for description parsing result. Args: @@ -211,7 +254,6 @@ class DescriptionParser: self._date_memory.clear() for i in re.split(',|。', self.description): for j in SentenceParser(i, year).parse(self._date_memory): - self._date_memory.add(j['date']) yield j if not self._date_memory: diff --git a/requirements.txt b/requirements.txt index 792285a..4f0073b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests ~= 2.21.0 -beautifulsoup4 ~= 4.7.1 \ No newline at end of file +beautifulsoup4 ~= 4.7.1 +tqdm ~= 4.30.0 diff --git a/test_fetch_holidays.py b/test_fetch_holidays.py index f8a1e57..b261971 100644 --- a/test_fetch_holidays.py +++ b/test_fetch_holidays.py @@ -1,4 +1,4 @@ - +"""Test module `fetch_holidays`. """ import json import sys @@ -6,7 +6,8 @@ from fetch_holidays import CustomJSONEncoder, DescriptionParser def _normalize(iterable): - return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date']) + return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), + key=lambda x: x['date']) def _generate_tests(): diff --git a/update.py b/update.py index 1106ddd..d9173ce 100755 --- a/update.py +++ b/update.py @@ -1,9 +1,14 @@ #!/usr/bin/env python3 +"""Script for updating data. """ + +import argparse import json import os import subprocess from datetime import datetime, timedelta, tzinfo +from tqdm import tqdm + from fetch_holidays import CustomJSONEncoder, fetch_holiday @@ -28,7 +33,16 @@ def _file_path(*other): return os.path.join(__dirname__, *other) -def update_data(year): +def update_data(year: int) -> str: + """Update and store data for a year. + + Args: + year (int): Year + + Returns: + str: Stored data path + """ + filename = _file_path(f'{year}.json') with open(filename, 'w', encoding='utf-8', newline='\n') as f: json.dump(fetch_holiday(year), f, @@ -39,11 +53,18 @@ def update_data(year): def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--all', action='store_true') + args = parser.parse_args() + now = datetime.now(ChinaTimezone()) filenames = [] - filenames.append(update_data(now.year)) - filenames.append(update_data(now.year + 1)) + progress = tqdm(range(2014 if args.all else now.year, now.year + 2)) + for i in progress: + progress.set_description(f'Updating {i} data') + filename = update_data(i) + filenames.append(filename) subprocess.run(['git', 'add', *filenames], check=True) diff = subprocess.run(['git', 'diff', '--stat', '--cached'],