Implement paper parsing
This commit is contained in:
parent
68e3d12722
commit
3311176755
|
|
@ -62,23 +62,38 @@ def _cast_int(value):
|
||||||
|
|
||||||
|
|
||||||
class SentenceParser:
|
class SentenceParser:
|
||||||
"""Parser for rule sentence. """
|
"""Parser for holiday shift description sentence. """
|
||||||
|
|
||||||
def __init__(self, sentence, year):
|
def __init__(self, sentence, year):
|
||||||
self.sentence = sentence
|
self.sentence = sentence
|
||||||
self.year = year
|
self.year = year
|
||||||
|
self._date_memory = set()
|
||||||
|
|
||||||
def extract_dates(self, value) -> List[date]:
|
def extract_dates(self, value) -> List[date]:
|
||||||
memory = set()
|
|
||||||
for method in self.date_extraction_methods:
|
for method in self.date_extraction_methods:
|
||||||
for i in method(self, value):
|
for i in method(self, value):
|
||||||
if i not in memory:
|
if i not in self._date_memory:
|
||||||
memory.add(i)
|
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
def get_date(self, year: Optional[int], month: int, day: int) -> date:
|
def get_date(self, year: Optional[int], month: int, day: int) -> date:
|
||||||
if year is None and month > 10:
|
"""Get date in context.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
year (Optional[int]): year
|
||||||
|
month (int): month
|
||||||
|
day (int): day
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
date: Date result
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Special case: 12 month may mean previous year
|
||||||
|
if (year is None
|
||||||
|
and month == 12
|
||||||
|
and self._date_memory
|
||||||
|
and max(self._date_memory) < date(year=self.year, month=2, day=1)):
|
||||||
year = self.year - 1
|
year = self.year - 1
|
||||||
|
|
||||||
year = year or self.year
|
year = year or self.year
|
||||||
return date(year=year, month=month, day=day)
|
return date(year=year, month=month, day=day)
|
||||||
|
|
||||||
|
|
@ -104,7 +119,9 @@ class SentenceParser:
|
||||||
|
|
||||||
def _extract_dates_3(self, value):
|
def _extract_dates_3(self, value):
|
||||||
match = re.match(
|
match = re.match(
|
||||||
r'(?:(\d+)年)?(?:(\d+)月)(\d+)日(?:([^)]+))?(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+', value)
|
r'(?:(\d+)年)?(?:(\d+)月)(\d+)日(?:([^)]+))?'
|
||||||
|
r'(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:([^)]+))?)+',
|
||||||
|
value.replace('(', '(').replace(')', ')'))
|
||||||
if match:
|
if match:
|
||||||
groups = [_cast_int(i) for i in match.groups()]
|
groups = [_cast_int(i) for i in match.groups()]
|
||||||
assert not (len(groups) % 3), groups
|
assert not (len(groups) % 3), groups
|
||||||
|
|
@ -125,14 +142,12 @@ class SentenceParser:
|
||||||
_extract_dates_3
|
_extract_dates_3
|
||||||
]
|
]
|
||||||
|
|
||||||
def parse(self):
|
def parse(self, memory):
|
||||||
date_memory = set()
|
self._date_memory = memory
|
||||||
for method in self.parsing_methods:
|
for method in self.parsing_methods:
|
||||||
|
|
||||||
for i in method(self):
|
for i in method(self):
|
||||||
if i['date'] in date_memory:
|
if i['date'] in self._date_memory:
|
||||||
continue
|
continue
|
||||||
date_memory.add(i['date'])
|
|
||||||
yield i
|
yield i
|
||||||
|
|
||||||
def _parse_rest_1(self):
|
def _parse_rest_1(self):
|
||||||
|
|
@ -174,13 +189,24 @@ class SentenceParser:
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def parse_holiday_description(description: str, year: int):
|
class DescriptionParser:
|
||||||
date_memory = set()
|
"""Parser for holiday shift description. """
|
||||||
for i in re.split(',|。', description):
|
|
||||||
for j in SentenceParser(i, year).parse():
|
def __init__(self, description):
|
||||||
if j['date'] in date_memory:
|
self.description = description
|
||||||
continue
|
self._date_memory = set()
|
||||||
date_memory.add(j['date'])
|
|
||||||
|
def parse(self, year: int):
|
||||||
|
"""Generator for description parsing result.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
year (int): Context year
|
||||||
|
"""
|
||||||
|
|
||||||
|
self._date_memory.clear()
|
||||||
|
for i in re.split(',|。', self.description):
|
||||||
|
for j in SentenceParser(i, year).parse(self._date_memory):
|
||||||
|
self._date_memory.add(j['date'])
|
||||||
yield j
|
yield j
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -199,9 +225,14 @@ def main():
|
||||||
ret.extend({
|
ret.extend({
|
||||||
'name': name,
|
'name': name,
|
||||||
**j
|
**j
|
||||||
} for j in parse_holiday_description(description, year))
|
} for j in DescriptionParser(description).parse(year))
|
||||||
|
|
||||||
print(json.dumps(ret, indent=4, ensure_ascii=False, cls=CustomJSONEncoder))
|
result = {
|
||||||
|
'year': year,
|
||||||
|
'papers': papers,
|
||||||
|
'days': sorted(ret, key=lambda x: x['date'])
|
||||||
|
}
|
||||||
|
print(json.dumps(result, indent=4, ensure_ascii=False, cls=CustomJSONEncoder))
|
||||||
|
|
||||||
|
|
||||||
class CustomJSONEncoder(json.JSONEncoder):
|
class CustomJSONEncoder(json.JSONEncoder):
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from fetch_holidays import CustomJSONEncoder, parse_holiday_description
|
from fetch_holidays import CustomJSONEncoder, DescriptionParser
|
||||||
|
|
||||||
|
|
||||||
def _normalize(iterable):
|
def _normalize(iterable):
|
||||||
|
|
@ -16,13 +16,13 @@ def _generate_tests():
|
||||||
def create_test(case):
|
def create_test(case):
|
||||||
def _test():
|
def _test():
|
||||||
year, description, expected = case['year'], case['description'], case['expected']
|
year, description, expected = case['year'], case['description'], case['expected']
|
||||||
assert _normalize(parse_holiday_description(
|
assert _normalize(DescriptionParser(description)
|
||||||
description, year)) == _normalize(expected), case
|
.parse(year)) == _normalize(expected), case
|
||||||
return _test
|
return _test
|
||||||
|
|
||||||
for index, case in enumerate(cases, 1):
|
for index, case in enumerate(cases, 1):
|
||||||
setattr(sys.modules[__name__],
|
setattr(sys.modules[__name__],
|
||||||
f'test_parse_holiday_description_{index}', create_test(case))
|
f'test_description_parser_{index}', create_test(case))
|
||||||
|
|
||||||
|
|
||||||
_generate_tests()
|
_generate_tests()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user