From 9324d6eaafd7d2ebe495a8f9cbe4501997c40c8a Mon Sep 17 00:00:00 2001 From: NateScarlet Date: Sat, 9 Mar 2019 01:10:53 +0800 Subject: [PATCH] Handle \u3000 used in 2016 --- 2016.json | Bin 132 -> 7244 bytes fetch_holidays.py | 8 ++++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/2016.json b/2016.json index adbec9de269420f7e46cdd2a4682f723ffd8156b..ba229abd2910c35007091da4f2371ad485f1d824 100644 GIT binary patch literal 7244 zcmd^^%Sr<=6o&te8$q8z0`BZ(rWdVSUAR)L;6|#H+FC)|Qb(ngZo2SU-1`{5hg+Y( z%b!d~r}Z+J2}8;dNJu!GoG+P^a}M+Vc84V_Vr9|<256#=J_I(o<{}RVWvtCe^(fP$ zWFJ=($)hQWFMVxzY&~krvE8bc#TmM|K^6^kxH?3ex;k8KaLlpqa^|Tuk8|IO5;C;K z<=mlFvr;E&wSWqW)GK$rq-9~Y+Bb?*QVb#prq3tf3PwRY>)fe1IR$~O{pY5b*(s0? zMgh6A^XM1L{!6*q8jqvx`-{@$ zZRYu9MQiz)vPI$x^3Mn7x;E!L-R4FrC@U$TlrOBVXU@+T>o8`dfznL{Wg!)mMQzGt zBIZJ8Id*PPRJnmZX%a8cWA&jLHwTg#t<0}&euK<$+WiW>40jIV-e+0er``F1DNfrn z1aq8r`=>cheg8}*vSfCj25~*KLJxINviij}$EjakClNu-aoSy7o8#24uaoi6*Tehc zcpfUFiv2l}IZpjIHwh0l$7y$MW{%VDn#CNa-4WCrr`-|M9H-q8)EsAeBB*{98kH-A N|0~thU#5}jKLE~(CQUC&_%GAUnB`XCZ14A<%h)6+VL26MkP%IkE2dl~`DJihh*Do(G*GtbY(@W0N zugXZxOHWSKPtMOPNzE(K2O40iYiOizXaEz6H#9J|G||f_$yEXy6bm&!C9x80dMsBh E00LMeTmS$7 diff --git a/fetch_holidays.py b/fetch_holidays.py index 47b2e26..5739c9b 100755 --- a/fetch_holidays.py +++ b/fetch_holidays.py @@ -44,18 +44,23 @@ def get_paper(url): soup = bs4.BeautifulSoup(response.text, features='html.parser') container = soup.find('td', class_='b12c') assert container, f'Can not get paper container from url: {url}' - ret = container.get_text() + ret = container.get_text().replace('\u3000', '\n') assert ret, f'Can not get paper context from url: {url}' return ret def get_rules(paper: str): lines: list = paper.splitlines() + count = 0 for i in sorted(set(lines), key=lines.index): match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i) if match: + count += 1 yield match.groups() + if not count: + raise NotImplementedError(lines) + def _cast_int(value): return int(value) if value else None @@ -227,7 +232,6 @@ def fetch_holiday(year: int): 'name': name, **j } for j in DescriptionParser(description).parse(year)) - return { 'year': year, 'papers': papers,