Refactor code
This commit is contained in:
parent
155f06ff3f
commit
350ceab783
|
|
@ -5,7 +5,7 @@ import argparse
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from datetime import date, timedelta
|
from datetime import date, timedelta
|
||||||
from typing import List, Optional
|
from typing import Iterator, List, Optional, Tuple
|
||||||
|
|
||||||
import bs4
|
import bs4
|
||||||
import requests
|
import requests
|
||||||
|
|
@ -30,26 +30,46 @@ def get_paper_urls(year: int) -> List[str]:
|
||||||
body = requests.get(url).text
|
body = requests.get(url).text
|
||||||
ret = re.findall(
|
ret = re.findall(
|
||||||
r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)
|
r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)
|
||||||
assert all(
|
|
||||||
re.match(
|
|
||||||
r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i)
|
|
||||||
for i in ret), 'Site changed, need human verify'
|
|
||||||
|
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def get_paper(url):
|
def get_paper(url: str) -> str:
|
||||||
|
"""Extract paper text from url.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): Paper url.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted paper text.
|
||||||
|
"""
|
||||||
|
|
||||||
|
assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm',
|
||||||
|
url), 'Site changed, need human verify'
|
||||||
|
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
response.encoding = 'utf-8'
|
response.encoding = 'utf-8'
|
||||||
soup = bs4.BeautifulSoup(response.text, features='html.parser')
|
soup = bs4.BeautifulSoup(response.text, features='html.parser')
|
||||||
container = soup.find('td', class_='b12c')
|
container = soup.find('td', class_='b12c')
|
||||||
assert container, f'Can not get paper container from url: {url}'
|
assert container, f'Can not get paper container from url: {url}'
|
||||||
ret = container.get_text().replace('\u3000', '\n')
|
ret = container.get_text().replace('\u3000\u3000', '\n')
|
||||||
assert ret, f'Can not get paper context from url: {url}'
|
assert ret, f'Can not get paper context from url: {url}'
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def get_rules(paper: str):
|
def get_rules(paper: str) -> Iterator[Tuple[str, str]]:
|
||||||
|
"""Extract rules from paper.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
paper (str): Paper text
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
NotImplementedError: When find no rules.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Iterator[Tuple[str, str]]: (name, description)
|
||||||
|
"""
|
||||||
|
|
||||||
lines: list = paper.splitlines()
|
lines: list = paper.splitlines()
|
||||||
count = 0
|
count = 0
|
||||||
for i in sorted(set(lines), key=lines.index):
|
for i in sorted(set(lines), key=lines.index):
|
||||||
|
|
@ -74,12 +94,28 @@ class SentenceParser:
|
||||||
self.year = year
|
self.year = year
|
||||||
self._date_memory = set()
|
self._date_memory = set()
|
||||||
|
|
||||||
def extract_dates(self, value) -> List[date]:
|
def extract_dates(self, text: str) -> Iterator[date]:
|
||||||
|
"""Extract date from text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): Text to extract
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Iterator[date]: Extracted dates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
count = 0
|
||||||
for method in self.date_extraction_methods:
|
for method in self.date_extraction_methods:
|
||||||
for i in method(self, value):
|
for i in method(self, text):
|
||||||
if i not in self._date_memory:
|
count += 1
|
||||||
|
if i in self._date_memory:
|
||||||
|
continue
|
||||||
|
self._date_memory.add(i)
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
|
if not count:
|
||||||
|
raise NotImplementedError(text)
|
||||||
|
|
||||||
def get_date(self, year: Optional[int], month: int, day: int) -> date:
|
def get_date(self, year: Optional[int], month: int, day: int) -> date:
|
||||||
"""Get date in context.
|
"""Get date in context.
|
||||||
|
|
||||||
|
|
@ -147,12 +183,19 @@ class SentenceParser:
|
||||||
_extract_dates_3
|
_extract_dates_3
|
||||||
]
|
]
|
||||||
|
|
||||||
def parse(self, memory):
|
def parse(self, memory: set) -> Iterator[dict]:
|
||||||
|
"""Parse days with memory
|
||||||
|
|
||||||
|
Args:
|
||||||
|
memory (set): Date memory
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Iterator[dict]: Days without name field.
|
||||||
|
"""
|
||||||
|
|
||||||
self._date_memory = memory
|
self._date_memory = memory
|
||||||
for method in self.parsing_methods:
|
for method in self.parsing_methods:
|
||||||
for i in method(self):
|
for i in method(self):
|
||||||
if i['date'] in self._date_memory:
|
|
||||||
continue
|
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
def _parse_rest_1(self):
|
def _parse_rest_1(self):
|
||||||
|
|
@ -201,7 +244,7 @@ class DescriptionParser:
|
||||||
self.description = description
|
self.description = description
|
||||||
self._date_memory = set()
|
self._date_memory = set()
|
||||||
|
|
||||||
def parse(self, year: int):
|
def parse(self, year: int) -> Iterator[dict]:
|
||||||
"""Generator for description parsing result.
|
"""Generator for description parsing result.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -211,7 +254,6 @@ class DescriptionParser:
|
||||||
self._date_memory.clear()
|
self._date_memory.clear()
|
||||||
for i in re.split(',|。', self.description):
|
for i in re.split(',|。', self.description):
|
||||||
for j in SentenceParser(i, year).parse(self._date_memory):
|
for j in SentenceParser(i, year).parse(self._date_memory):
|
||||||
self._date_memory.add(j['date'])
|
|
||||||
yield j
|
yield j
|
||||||
|
|
||||||
if not self._date_memory:
|
if not self._date_memory:
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,3 @@
|
||||||
requests ~= 2.21.0
|
requests ~= 2.21.0
|
||||||
beautifulsoup4 ~= 4.7.1
|
beautifulsoup4 ~= 4.7.1
|
||||||
|
tqdm ~= 4.30.0
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
|
"""Test module `fetch_holidays`. """
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
@ -6,7 +6,8 @@ from fetch_holidays import CustomJSONEncoder, DescriptionParser
|
||||||
|
|
||||||
|
|
||||||
def _normalize(iterable):
|
def _normalize(iterable):
|
||||||
return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date'])
|
return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)),
|
||||||
|
key=lambda x: x['date'])
|
||||||
|
|
||||||
|
|
||||||
def _generate_tests():
|
def _generate_tests():
|
||||||
|
|
|
||||||
27
update.py
27
update.py
|
|
@ -1,9 +1,14 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
"""Script for updating data. """
|
||||||
|
|
||||||
|
import argparse
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
from datetime import datetime, timedelta, tzinfo
|
from datetime import datetime, timedelta, tzinfo
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
from fetch_holidays import CustomJSONEncoder, fetch_holiday
|
from fetch_holidays import CustomJSONEncoder, fetch_holiday
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -28,7 +33,16 @@ def _file_path(*other):
|
||||||
return os.path.join(__dirname__, *other)
|
return os.path.join(__dirname__, *other)
|
||||||
|
|
||||||
|
|
||||||
def update_data(year):
|
def update_data(year: int) -> str:
|
||||||
|
"""Update and store data for a year.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
year (int): Year
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Stored data path
|
||||||
|
"""
|
||||||
|
|
||||||
filename = _file_path(f'{year}.json')
|
filename = _file_path(f'{year}.json')
|
||||||
with open(filename, 'w', encoding='utf-8', newline='\n') as f:
|
with open(filename, 'w', encoding='utf-8', newline='\n') as f:
|
||||||
json.dump(fetch_holiday(year), f,
|
json.dump(fetch_holiday(year), f,
|
||||||
|
|
@ -39,11 +53,18 @@ def update_data(year):
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('--all', action='store_true')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
now = datetime.now(ChinaTimezone())
|
now = datetime.now(ChinaTimezone())
|
||||||
|
|
||||||
filenames = []
|
filenames = []
|
||||||
filenames.append(update_data(now.year))
|
progress = tqdm(range(2014 if args.all else now.year, now.year + 2))
|
||||||
filenames.append(update_data(now.year + 1))
|
for i in progress:
|
||||||
|
progress.set_description(f'Updating {i} data')
|
||||||
|
filename = update_data(i)
|
||||||
|
filenames.append(filename)
|
||||||
|
|
||||||
subprocess.run(['git', 'add', *filenames], check=True)
|
subprocess.run(['git', 'add', *filenames], check=True)
|
||||||
diff = subprocess.run(['git', 'diff', '--stat', '--cached'],
|
diff = subprocess.run(['git', 'diff', '--stat', '--cached'],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user