Handle \u3000 used in 2016

This commit is contained in:
NateScarlet 2019-03-09 01:10:53 +08:00
parent ede5f2050f
commit 9324d6eaaf
No known key found for this signature in database
GPG Key ID: 5C242793B070309C
2 changed files with 6 additions and 2 deletions

BIN
2016.json

Binary file not shown.

View File

@@ -44,18 +44,23 @@ def get_paper(url):
soup = bs4.BeautifulSoup(response.text, features='html.parser') soup = bs4.BeautifulSoup(response.text, features='html.parser')
container = soup.find('td', class_='b12c') container = soup.find('td', class_='b12c')
assert container, f'Can not get paper container from url: {url}' assert container, f'Can not get paper container from url: {url}'
ret = container.get_text() ret = container.get_text().replace('\u3000', '\n')
assert ret, f'Can not get paper context from url: {url}' assert ret, f'Can not get paper context from url: {url}'
return ret return ret
def get_rules(paper: str): def get_rules(paper: str):
lines: list = paper.splitlines() lines: list = paper.splitlines()
count = 0
for i in sorted(set(lines), key=lines.index): for i in sorted(set(lines), key=lines.index):
match = re.match(r'[一二三四五六七八九十]、(.+?)(.+)', i) match = re.match(r'[一二三四五六七八九十]、(.+?)(.+)', i)
if match: if match:
count += 1
yield match.groups() yield match.groups()
if not count:
raise NotImplementedError(lines)
def _cast_int(value): def _cast_int(value):
return int(value) if value else None return int(value) if value else None
@@ -227,7 +232,6 @@ def fetch_holiday(year: int):
'name': name, 'name': name,
**j **j
} for j in DescriptionParser(description).parse(year)) } for j in DescriptionParser(description).parse(year))
return { return {
'year': year, 'year': year,
'papers': papers, 'papers': papers,