Handle \u3000 used in 2016
This commit is contained in:
parent
ede5f2050f
commit
9324d6eaaf
|
|
@@ -44,18 +44,23 @@ def get_paper(url):
|
||||||
soup = bs4.BeautifulSoup(response.text, features='html.parser')
|
soup = bs4.BeautifulSoup(response.text, features='html.parser')
|
||||||
container = soup.find('td', class_='b12c')
|
container = soup.find('td', class_='b12c')
|
||||||
assert container, f'Can not get paper container from url: {url}'
|
assert container, f'Can not get paper container from url: {url}'
|
||||||
ret = container.get_text()
|
ret = container.get_text().replace('\u3000', '\n')
|
||||||
assert ret, f'Can not get paper context from url: {url}'
|
assert ret, f'Can not get paper context from url: {url}'
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
def get_rules(paper: str):
|
def get_rules(paper: str):
|
||||||
lines: list = paper.splitlines()
|
lines: list = paper.splitlines()
|
||||||
|
count = 0
|
||||||
for i in sorted(set(lines), key=lines.index):
|
for i in sorted(set(lines), key=lines.index):
|
||||||
match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i)
|
match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i)
|
||||||
if match:
|
if match:
|
||||||
|
count += 1
|
||||||
yield match.groups()
|
yield match.groups()
|
||||||
|
|
||||||
|
if not count:
|
||||||
|
raise NotImplementedError(lines)
|
||||||
|
|
||||||
|
|
||||||
def _cast_int(value):
|
def _cast_int(value):
|
||||||
return int(value) if value else None
|
return int(value) if value else None
|
||||||
|
|
@@ -227,7 +232,6 @@ def fetch_holiday(year: int):
|
||||||
'name': name,
|
'name': name,
|
||||||
**j
|
**j
|
||||||
} for j in DescriptionParser(description).parse(year))
|
} for j in DescriptionParser(description).parse(year))
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'year': year,
|
'year': year,
|
||||||
'papers': papers,
|
'papers': papers,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user