Refactor code

This commit is contained in:
NateScarlet 2019-03-09 12:27:41 +08:00
parent 155f06ff3f
commit 350ceab783
No known key found for this signature in database
GPG Key ID: 5C242793B070309C
5 changed files with 88 additions and 23 deletions

BIN
2016.json

Binary file not shown.

View File

@ -5,7 +5,7 @@ import argparse
import json import json
import re import re
from datetime import date, timedelta from datetime import date, timedelta
from typing import List, Optional from typing import Iterator, List, Optional, Tuple
import bs4 import bs4
import requests import requests
@ -30,26 +30,46 @@ def get_paper_urls(year: int) -> List[str]:
body = requests.get(url).text body = requests.get(url).text
ret = re.findall( ret = re.findall(
r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S) r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)
assert all(
re.match(
r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i)
for i in ret), 'Site changed, need human verify'
return ret return ret
def get_paper(url): def get_paper(url: str) -> str:
"""Extract paper text from url.
Args:
url (str): Paper url.
Returns:
str: Extracted paper text.
"""
assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm',
url), 'Site changed, need human verify'
response = requests.get(url) response = requests.get(url)
response.encoding = 'utf-8' response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, features='html.parser') soup = bs4.BeautifulSoup(response.text, features='html.parser')
container = soup.find('td', class_='b12c') container = soup.find('td', class_='b12c')
assert container, f'Can not get paper container from url: {url}' assert container, f'Can not get paper container from url: {url}'
ret = container.get_text().replace('\u3000', '\n') ret = container.get_text().replace('\u3000\u3000', '\n')
assert ret, f'Can not get paper context from url: {url}' assert ret, f'Can not get paper context from url: {url}'
return ret return ret
def get_rules(paper: str): def get_rules(paper: str) -> Iterator[Tuple[str, str]]:
"""Extract rules from paper.
Args:
paper (str): Paper text
Raises:
NotImplementedError: When no rules are found.
Returns:
Iterator[Tuple[str, str]]: (name, description)
"""
lines: list = paper.splitlines() lines: list = paper.splitlines()
count = 0 count = 0
for i in sorted(set(lines), key=lines.index): for i in sorted(set(lines), key=lines.index):
@ -74,11 +94,27 @@ class SentenceParser:
self.year = year self.year = year
self._date_memory = set() self._date_memory = set()
def extract_dates(self, value) -> List[date]: def extract_dates(self, text: str) -> Iterator[date]:
"""Extract date from text.
Args:
text (str): Text to extract
Returns:
Iterator[date]: Extracted dates.
"""
count = 0
for method in self.date_extraction_methods: for method in self.date_extraction_methods:
for i in method(self, value): for i in method(self, text):
if i not in self._date_memory: count += 1
yield i if i in self._date_memory:
continue
self._date_memory.add(i)
yield i
if not count:
raise NotImplementedError(text)
def get_date(self, year: Optional[int], month: int, day: int) -> date: def get_date(self, year: Optional[int], month: int, day: int) -> date:
"""Get date in context. """Get date in context.
@ -147,12 +183,19 @@ class SentenceParser:
_extract_dates_3 _extract_dates_3
] ]
def parse(self, memory): def parse(self, memory: set) -> Iterator[dict]:
"""Parse days with memory
Args:
memory (set): Date memory
Returns:
Iterator[dict]: Days without name field.
"""
self._date_memory = memory self._date_memory = memory
for method in self.parsing_methods: for method in self.parsing_methods:
for i in method(self): for i in method(self):
if i['date'] in self._date_memory:
continue
yield i yield i
def _parse_rest_1(self): def _parse_rest_1(self):
@ -201,7 +244,7 @@ class DescriptionParser:
self.description = description self.description = description
self._date_memory = set() self._date_memory = set()
def parse(self, year: int): def parse(self, year: int) -> Iterator[dict]:
"""Generator for description parsing result. """Generator for description parsing result.
Args: Args:
@ -211,7 +254,6 @@ class DescriptionParser:
self._date_memory.clear() self._date_memory.clear()
for i in re.split('|。', self.description): for i in re.split('|。', self.description):
for j in SentenceParser(i, year).parse(self._date_memory): for j in SentenceParser(i, year).parse(self._date_memory):
self._date_memory.add(j['date'])
yield j yield j
if not self._date_memory: if not self._date_memory:

View File

@ -1,2 +1,3 @@
requests ~= 2.21.0 requests ~= 2.21.0
beautifulsoup4 ~= 4.7.1 beautifulsoup4 ~= 4.7.1
tqdm ~= 4.30.0

View File

@ -1,4 +1,4 @@
"""Test module `fetch_holidays`. """
import json import json
import sys import sys
@ -6,7 +6,8 @@ from fetch_holidays import CustomJSONEncoder, DescriptionParser
def _normalize(iterable): def _normalize(iterable):
return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date']) return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)),
key=lambda x: x['date'])
def _generate_tests(): def _generate_tests():

View File

@ -1,9 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Script for updating data. """
import argparse
import json import json
import os import os
import subprocess import subprocess
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
from tqdm import tqdm
from fetch_holidays import CustomJSONEncoder, fetch_holiday from fetch_holidays import CustomJSONEncoder, fetch_holiday
@ -28,7 +33,16 @@ def _file_path(*other):
return os.path.join(__dirname__, *other) return os.path.join(__dirname__, *other)
def update_data(year): def update_data(year: int) -> str:
"""Update and store data for a year.
Args:
year (int): Year
Returns:
str: Stored data path
"""
filename = _file_path(f'{year}.json') filename = _file_path(f'{year}.json')
with open(filename, 'w', encoding='utf-8', newline='\n') as f: with open(filename, 'w', encoding='utf-8', newline='\n') as f:
json.dump(fetch_holiday(year), f, json.dump(fetch_holiday(year), f,
@ -39,11 +53,18 @@ def update_data(year):
def main(): def main():
parser = argparse.ArgumentParser()
parser.add_argument('--all', action='store_true')
args = parser.parse_args()
now = datetime.now(ChinaTimezone()) now = datetime.now(ChinaTimezone())
filenames = [] filenames = []
filenames.append(update_data(now.year)) progress = tqdm(range(2014 if args.all else now.year, now.year + 2))
filenames.append(update_data(now.year + 1)) for i in progress:
progress.set_description(f'Updating {i} data')
filename = update_data(i)
filenames.append(filename)
subprocess.run(['git', 'add', *filenames], check=True) subprocess.run(['git', 'add', *filenames], check=True)
diff = subprocess.run(['git', 'diff', '--stat', '--cached'], diff = subprocess.run(['git', 'diff', '--stat', '--cached'],