Handle \u3000 (ideographic space) used as a line separator in the 2016 paper

This commit is contained in:
NateScarlet 2019-03-09 01:10:53 +08:00
parent ede5f2050f
commit 9324d6eaaf
No known key found for this signature in database
GPG Key ID: 5C242793B070309C
2 changed files with 6 additions and 2 deletions

BIN
2016.json

Binary file not shown.

View File

@@ -44,18 +44,23 @@ def get_paper(url):
soup = bs4.BeautifulSoup(response.text, features='html.parser')
container = soup.find('td', class_='b12c')
assert container, f'Can not get paper container from url: {url}'
ret = container.get_text()
ret = container.get_text().replace('\u3000', '\n')
assert ret, f'Can not get paper context from url: {url}'
return ret
def get_rules(paper: str):
    """Yield the numbered rules found in a paper's text.

    The text is split into lines and duplicate lines are dropped, each
    distinct line keeping the position of its first occurrence. Every line
    that starts with a Chinese numeral (one of 一 to 十) followed by ``、``
    yields the tuple of the two groups captured after that prefix.

    Args:
        paper: Plain text of the paper.

    Yields:
        The ``match.groups()`` tuple (two strings) for each matching line.

    Raises:
        NotImplementedError: Raised when the generator is exhausted without
            a single matching line; it carries the full list of lines so
            the unsupported layout can be inspected.
    """
    lines: list = paper.splitlines()
    count = 0
    # dict.fromkeys de-duplicates in first-seen order in a single pass.
    for i in dict.fromkeys(lines):
        # NOTE(review): with no separator between the two groups, the lazy
        # first group always captures exactly one character -- confirm that
        # no delimiter was lost from this pattern.
        match = re.match(r'[一二三四五六七八九十]、(.+?)(.+)', i)
        if match:
            count += 1
            yield match.groups()
    if not count:
        raise NotImplementedError(lines)
def _cast_int(value):
return int(value) if value else None
@@ -227,7 +232,6 @@ def fetch_holiday(year: int):
'name': name,
**j
} for j in DescriptionParser(description).parse(year))
return {
'year': year,
'papers': papers,