From 350ceab783c2bc3ab1f7e5b2991b6e761542f226 Mon Sep 17 00:00:00 2001 From: NateScarlet Date: Sat, 9 Mar 2019 12:27:41 +0800 Subject: [PATCH] Refactor code --- 2016.json | Bin 7244 -> 3622 bytes fetch_holidays.py | 76 ++++++++++++++++++++++++++++++++--------- requirements.txt | 3 +- test_fetch_holidays.py | 5 +-- update.py | 27 +++++++++++++-- 5 files changed, 88 insertions(+), 23 deletions(-) diff --git a/2016.json b/2016.json index ba229abd2910c35007091da4f2371ad485f1d824..0073de810de7027fe84176dcfd6136fc9514686b 100644 GIT binary patch literal 3622 zcmb>CQUC&_%GAUnB`XCZ14A<%h)6+VL26MkP%IkE2dl~`DJihh*Do(G*GtbY(@W0N zugXZxOHWSKPtMOPNzE(K2O40iYiOizXaEz6H#9J|G||f_$yEXy6bm&!C9x9K^lG?0 zV7u}Xb5nt~D?M#(el~rX5=0kB1_YE+5=+3cAP4Ij7y=Qxip*mFv^1B*N}w4fMWv~5 z!)p;H;jr=9jHNHSn$XQdwb4k|zzkgll8tGJImI-v(j1SKwDF_`-EFj_w~dAdbhnZ2 ziPg}E?tV0;yNxDzY$Q2D?r42BqmN)QV4`b)&sGxiQZWrZ zU%mcmZwEo20*iJk76(LIO3f01cuT2RC=hQczBnZ{&pg?&_WAN|f{|*eYiNomr;?aw zh_@79eN8h|qm3T|3m{EzhbYL1c!2cSxM~4M95m(t literal 7244 zcmd^^%Sr<=6o&te8$q8z0`BZ(rWdVSUAR)L;6|#H+FC)|Qb(ngZo2SU-1`{5hg+Y( z%b!d~r}Z+J2}8;dNJu!GoG+P^a}M+Vc84V_Vr9|<256#=J_I(o<{}RVWvtCe^(fP$ zWFJ=($)hQWFMVxzY&~krvE8bc#TmM|K^6^kxH?3ex;k8KaLlpqa^|Tuk8|IO5;C;K z<=mlFvr;E&wSWqW)GK$rq-9~Y+Bb?*QVb#prq3tf3PwRY>)fe1IR$~O{pY5b*(s0? zMgh6A^XM1L{!6*q8jqvx`-{@$ zZRYu9MQiz)vPI$x^3Mn7x;E!L-R4FrC@U$TlrOBVXU@+T>o8`dfznL{Wg!)mMQzGt zBIZJ8Id*PPRJnmZX%a8cWA&jLHwTg#t<0}&euK<$+WiW>40jIV-e+0er``F1DNfrn z1aq8r`=>cheg8}*vSfCj25~*KLJxINviij}$EjakClNu-aoSy7o8#24uaoi6*Tehc zcpfUFiv2l}IZpjIHwh0l$7y$MW{%VDn#CNa-4WCrr`-|M9H-q8)EsAeBB*{98kH-A N|0~thU#5}jKLE~( List[str]: body = requests.get(url).text ret = re.findall( r'
  • ', body, flags=re.S) - assert all( - re.match( - r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i) - for i in ret), 'Site changed, need human verify' return ret -def get_paper(url): +def get_paper(url: str) -> str: + """Extract paper text from url. + + Args: + url (str): Paper url. + + Returns: + str: Extracted paper text. + """ + + assert re.match(r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', + url), 'Site changed, need human verify' + response = requests.get(url) response.encoding = 'utf-8' soup = bs4.BeautifulSoup(response.text, features='html.parser') container = soup.find('td', class_='b12c') assert container, f'Can not get paper container from url: {url}' - ret = container.get_text().replace('\u3000', '\n') + ret = container.get_text().replace('\u3000\u3000', '\n') assert ret, f'Can not get paper context from url: {url}' return ret -def get_rules(paper: str): +def get_rules(paper: str) -> Iterator[Tuple[str, str]]: + """Extract rules from paper. + + Args: + paper (str): Paper text + + Raises: + NotImplementedError: When find no rules. + + Returns: + Iterator[Tuple[str, str]]: (name, description) + """ + lines: list = paper.splitlines() count = 0 for i in sorted(set(lines), key=lines.index): @@ -74,11 +94,27 @@ class SentenceParser: self.year = year self._date_memory = set() - def extract_dates(self, value) -> List[date]: + def extract_dates(self, text: str) -> Iterator[date]: + """Extract date from text. + + Args: + text (str): Text to extract + + Returns: + Iterator[date]: Extracted dates. + """ + + count = 0 for method in self.date_extraction_methods: - for i in method(self, value): - if i not in self._date_memory: - yield i + for i in method(self, text): + count += 1 + if i in self._date_memory: + continue + self._date_memory.add(i) + yield i + + if not count: + raise NotImplementedError(text) def get_date(self, year: Optional[int], month: int, day: int) -> date: """Get date in context. @@ -147,12 +183,19 @@ class SentenceParser: _extract_dates_3 ] - def parse(self, memory): + def parse(self, memory: set) -> Iterator[dict]: + """Parse days with memory + + Args: + memory (set): Date memory + + Returns: + Iterator[dict]: Days without name field. + """ + self._date_memory = memory for method in self.parsing_methods: for i in method(self): - if i['date'] in self._date_memory: - continue yield i def _parse_rest_1(self): @@ -201,7 +244,7 @@ class DescriptionParser: self.description = description self._date_memory = set() - def parse(self, year: int): + def parse(self, year: int) -> Iterator[dict]: """Generator for description parsing result. Args: @@ -211,7 +254,6 @@ class DescriptionParser: self._date_memory.clear() for i in re.split(',|。', self.description): for j in SentenceParser(i, year).parse(self._date_memory): - self._date_memory.add(j['date']) yield j if not self._date_memory: diff --git a/requirements.txt b/requirements.txt index 792285a..4f0073b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ requests ~= 2.21.0 -beautifulsoup4 ~= 4.7.1 \ No newline at end of file +beautifulsoup4 ~= 4.7.1 +tqdm ~= 4.30.0 diff --git a/test_fetch_holidays.py b/test_fetch_holidays.py index f8a1e57..b261971 100644 --- a/test_fetch_holidays.py +++ b/test_fetch_holidays.py @@ -1,4 +1,4 @@ - +"""Test module `fetch_holidays`. """ import json import sys @@ -6,7 +6,8 @@ from fetch_holidays import CustomJSONEncoder, DescriptionParser def _normalize(iterable): - return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), key=lambda x: x['date']) + return sorted(json.loads(json.dumps(list(iterable), cls=CustomJSONEncoder)), + key=lambda x: x['date']) def _generate_tests(): diff --git a/update.py b/update.py index 1106ddd..d9173ce 100755 --- a/update.py +++ b/update.py @@ -1,9 +1,14 @@ #!/usr/bin/env python3 +"""Script for updating data. """ + +import argparse import json import os import subprocess from datetime import datetime, timedelta, tzinfo +from tqdm import tqdm + from fetch_holidays import CustomJSONEncoder, fetch_holiday @@ -28,7 +33,16 @@ def _file_path(*other): return os.path.join(__dirname__, *other) -def update_data(year): +def update_data(year: int) -> str: + """Update and store data for a year. + + Args: + year (int): Year + + Returns: + str: Stored data path + """ + filename = _file_path(f'{year}.json') with open(filename, 'w', encoding='utf-8', newline='\n') as f: json.dump(fetch_holiday(year), f, @@ -39,11 +53,18 @@ def update_data(year): def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--all', action='store_true') + args = parser.parse_args() + now = datetime.now(ChinaTimezone()) filenames = [] - filenames.append(update_data(now.year)) - filenames.append(update_data(now.year + 1)) + progress = tqdm(range(2014 if args.all else now.year, now.year + 2)) + for i in progress: + progress.set_description(f'Updating {i} data') + filename = update_data(i) + filenames.append(filename) subprocess.run(['git', 'add', *filenames], check=True) diff = subprocess.run(['git', 'diff', '--stat', '--cached'],