Refactor code

This commit is contained in:
NateScarlet 2019-03-09 12:27:41 +08:00
parent 155f06ff3f
commit 350ceab783
No known key found for this signature in database
GPG Key ID: 5C242793B070309C
5 changed files with 88 additions and 23 deletions

BIN
2016.json

Binary file not shown.

View File

@ -5,7 +5,7 @@ import argparse
import json
import re
from datetime import date, timedelta
from typing import List, Optional
from typing import Iterator, List, Optional, Tuple
import bs4
import requests
@ -30,26 +30,46 @@ def get_paper_urls(year: int) -> List[str]:
body = requests.get(url).text
ret = re.findall(
r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)
assert all(
re.match(
r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i)
for i in ret), 'Site changed, need human verify'
return ret
def get_paper(url):
def get_paper(url: str) -> str:
"""Extract paper text from url.
Args:
url (str): Paper url.
Returns:
str: Extracted paper text.
"""
assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm',
url), 'Site changed, need human verify'
response = requests.get(url)
response.encoding = 'utf-8'
soup = bs4.BeautifulSoup(response.text, features='html.parser')
container = soup.find('td', class_='b12c')
assert container, f'Can not get paper container from url: {url}'
ret = container.get_text().replace('\u3000', '\n')
ret = container.get_text().replace('\u3000\u3000', '\n')
assert ret, f'Can not get paper context from url: {url}'
return ret
def get_rules(paper: str):
def get_rules(paper: str) -> Iterator[Tuple[str, str]]:
"""Extract rules from paper.
Args:
paper (str): Paper text
Raises:
NotImplementedError: When find no rules.
Returns:
Iterator[Tuple[str, str]]: (name, description)
"""
lines: list = paper.splitlines()
count = 0
for i in sorted(set(lines), key=lines.index):
@ -74,11 +94,27 @@ class SentenceParser:
self.year = year
self._date_memory = set()
def extract_dates(self, value) -> List[date]:
def extract_dates(self, text: str) -> Iterator[date]:
"""Extract date from text.
Args:
text (str): Text to extract
Returns:
Iterator[date]: Extracted dates.
"""
count = 0
for method in self.date_extraction_methods:
for i in method(self, value):
if i not in self._date_memory:
yield i
for i in method(self, text):
count += 1
if i in self._date_memory:
continue
self._date_memory.add(i)
yield i
if not count:
raise NotImplementedError(text)
def get_date(self, year: Optional[int], month: int, day: int) -> date:
"""Get date in context.
@ -147,12 +183,19 @@ class SentenceParser:
_extract_dates_3
]
def parse(self, memory):
def parse(self, memory: set) -> Iterator[dict]:
"""Parse days with memory
Args:
memory (set): Date memory
Returns:
Iterator[dict]: Days without name field.
"""
self._date_memory = memory
for method in self.parsing_methods:
for i in method(self):
if i['date'] in self._date_memory:
continue
yield i
def _parse_rest_1(self):
@ -201,7 +244,7 @@ class DescriptionParser:
self.description = description
self._date_memory = set()
def parse(self, year: int):
def parse(self, year: int) -> Iterator[dict]:
"""Generator for description parsing result.
Args:
@ -211,7 +254,6 @@ class DescriptionParser:
self._date_memory.clear()
for i in re.split('|。', self.description):
for j in SentenceParser(i, year).parse(self._date_memory):
self._date_memory.add(j['date'])
yield j
if not self._date_memory:

View File

@ -1,2 +1,3 @@
requests ~= 2.21.0
beautifulsoup4 ~= 4.7.1
beautifulsoup4 ~= 4.7.1
tqdm ~= 4.30.0

View File

@ -1,4 +1,4 @@
"""Test module `fetch_holidays`. """
import json
import sys
@ -6,7 +6,8 @@ from fetch_holidays import CustomJSONEncoder, DescriptionParser
def _normalize(iterable):
return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date'])
return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)),
key=lambda x: x['date'])
def _generate_tests():

View File

@ -1,9 +1,14 @@
#!/usr/bin/env python3
"""Script for updating data. """
import argparse
import json
import os
import subprocess
from datetime import datetime, timedelta, tzinfo
from tqdm import tqdm
from fetch_holidays import CustomJSONEncoder, fetch_holiday
@ -28,7 +33,16 @@ def _file_path(*other):
return os.path.join(__dirname__, *other)
def update_data(year):
def update_data(year: int) -> str:
"""Update and store data for a year.
Args:
year (int): Year
Returns:
str: Stored data path
"""
filename = _file_path(f'{year}.json')
with open(filename, 'w', encoding='utf-8', newline='\n') as f:
json.dump(fetch_holiday(year), f,
@ -39,11 +53,18 @@ def update_data(year):
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--all', action='store_true')
args = parser.parse_args()
now = datetime.now(ChinaTimezone())
filenames = []
filenames.append(update_data(now.year))
filenames.append(update_data(now.year + 1))
progress = tqdm(range(2014 if args.all else now.year, now.year + 2))
for i in progress:
progress.set_description(f'Updating {i} data')
filename = update_data(i)
filenames.append(filename)
subprocess.run(['git', 'add', *filenames], check=True)
diff = subprocess.run(['git', 'diff', '--stat', '--cached'],