From 4bcf0aed87ab491304407d4435e6c30003187740 Mon Sep 17 00:00:00 2001 From: NateScarlet Date: Wed, 6 Mar 2019 21:41:31 +0800 Subject: [PATCH] Prepare test data --- description_parsing_cases.json | 296 +++++++++++++++++++++++++++++++++ fetch_holidays.py | 69 ++++++++ requirements.txt | 2 + test_fetch_holidays.py | 24 +++ 4 files changed, 391 insertions(+) create mode 100644 description_parsing_cases.json create mode 100644 fetch_holidays.py create mode 100644 requirements.txt create mode 100644 test_fetch_holidays.py diff --git a/description_parsing_cases.json b/description_parsing_cases.json new file mode 100644 index 0000000..8f62d0a --- /dev/null +++ b/description_parsing_cases.json @@ -0,0 +1,296 @@ +[ + { + "year": 2019, + "description": "2018年12月30日至2019年1月1日放假调休,共3天。2018年12月29日(星期六)上班。", + "expected": [ + { "date": "2018-12-30", "isOffDay": true }, + { "date": "2018-12-31", "isOffDay": true }, + { "date": "2019-01-01", "isOffDay": true }, + { "date": "2018-12-29", "isOffDay": false } + ] + }, + { + "year": 2019, + "description": "2月4日至10日放假调休,共7天。2月2日(星期六)、2月3日(星期日)上班。", + "expected": [ + { "date": "2019-02-04", "isOffDay": true }, + { "date": "2019-02-05", "isOffDay": true }, + { "date": "2019-02-06", "isOffDay": true }, + { "date": "2019-02-07", "isOffDay": true }, + { "date": "2019-02-08", "isOffDay": true }, + { "date": "2019-02-09", "isOffDay": true }, + { "date": "2019-02-10", "isOffDay": true }, + { "date": "2019-02-02", "isOffDay": false }, + { "date": "2019-02-03", "isOffDay": false } + ] + }, + { + "year": 2019, + "description": "4月5日放假,与周末连休。", + "expected": [{ "date": "2019-04-05", "isOffDay": true }] + }, + { + "year": 2019, + "description": "5月1日放假。", + "expected": [{ "date": "2019-05-01", "isOffDay": true }] + }, + { + "year": 2019, + "description": "6月7日放假,与周末连休。", + "expected": [{ "date": "2019-06-07", "isOffDay": true }] + }, + { + "year": 2019, + "description": "9月13日放假,与周末连休。", + "expected": [{ "date": "2019-09-13", "isOffDay": true }] + }, + { + "year": 2019, + "description": 
"10月1日至7日放假调休,共7天。9月29日(星期日)、10月12日(星期六)上班。", + "expected": [ + { "date": "2019-10-01", "isOffDay": true }, + { "date": "2019-10-02", "isOffDay": true }, + { "date": "2019-10-03", "isOffDay": true }, + { "date": "2019-10-04", "isOffDay": true }, + { "date": "2019-10-05", "isOffDay": true }, + { "date": "2019-10-06", "isOffDay": true }, + { "date": "2019-10-07", "isOffDay": true }, + { "date": "2019-09-29", "isOffDay": false }, + { "date": "2019-10-12", "isOffDay": false } + ] + }, + { + "year": 2018, + "description": "1月1日放假,与周末连休。", + "expected": [{ "date": "2018-01-01", "isOffDay": true }] + }, + { + "year": 2018, + "description": "2月15日至21日放假调休,共7天。2月11日(星期日)、2月24日(星期六)上班。", + "expected": [ + { "date": "2018-02-15", "isOffDay": true }, + { "date": "2018-02-16", "isOffDay": true }, + { "date": "2018-02-17", "isOffDay": true }, + { "date": "2018-02-18", "isOffDay": true }, + { "date": "2018-02-19", "isOffDay": true }, + { "date": "2018-02-20", "isOffDay": true }, + { "date": "2018-02-21", "isOffDay": true }, + { "date": "2018-02-11", "isOffDay": false }, + { "date": "2018-02-24", "isOffDay": false } + ] + }, + { + "year": 2018, + "description": "4月5日至7日放假调休,共3天。4月8日(星期日)上班。", + "expected": [ + { "date": "2018-04-05", "isOffDay": true }, + { "date": "2018-04-06", "isOffDay": true }, + { "date": "2018-04-07", "isOffDay": true }, + { "date": "2018-04-08", "isOffDay": false } + ] + }, + { + "year": 2018, + "description": "4月29日至5月1日放假调休,共3天。4月28日(星期六)上班。", + "expected": [ + { "date": "2018-04-29", "isOffDay": true }, + { "date": "2018-04-30", "isOffDay": true }, + { "date": "2018-05-01", "isOffDay": true }, + { "date": "2018-04-28", "isOffDay": false } + ] + }, + { + "year": 2018, + "description": "6月18日放假,与周末连休。", + "expected": [{ "date": "2018-06-18", "isOffDay": true }] + }, + { + "year": 2018, + "description": "9月24日放假,与周末连休。", + "expected": [{ "date": "2018-09-24", "isOffDay": true }] + }, + { + "year": 2018, + "description": 
"10月1日至7日放假调休,共7天。9月29日(星期六)、9月30日(星期日)上班。", + "expected": [ + { "date": "2018-10-01", "isOffDay": true }, + { "date": "2018-10-02", "isOffDay": true }, + { "date": "2018-10-03", "isOffDay": true }, + { "date": "2018-10-04", "isOffDay": true }, + { "date": "2018-10-05", "isOffDay": true }, + { "date": "2018-10-06", "isOffDay": true }, + { "date": "2018-10-07", "isOffDay": true }, + { "date": "2018-09-29", "isOffDay": false }, + { "date": "2018-09-30", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": "1月1日至3日放假调休,共3天。1月5日(星期六)、1月6日(星期日)上班。", + "expected": [ + { "date": "2013-01-01", "isOffDay": true }, + { "date": "2013-01-02", "isOffDay": true }, + { "date": "2013-01-03", "isOffDay": true }, + { "date": "2013-01-05", "isOffDay": false }, + { "date": "2013-01-06", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": "2月9日至15日放假调休,共7天。2月16日(星期六)、2月17日(星期日)上班。", + "expected": [ + { "date": "2013-02-09", "isOffDay": true }, + { "date": "2013-02-10", "isOffDay": true }, + { "date": "2013-02-11", "isOffDay": true }, + { "date": "2013-02-12", "isOffDay": true }, + { "date": "2013-02-13", "isOffDay": true }, + { "date": "2013-02-14", "isOffDay": true }, + { "date": "2013-02-15", "isOffDay": true }, + { "date": "2013-02-16", "isOffDay": false }, + { "date": "2013-02-17", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": "4月4日至6日放假调休,共3天。4月7日(星期日)上班。", + "expected": [ + { "date": "2013-04-04", "isOffDay": true }, + { "date": "2013-04-05", "isOffDay": true }, + { "date": "2013-04-06", "isOffDay": true }, + { "date": "2013-04-07", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": "4月29日至5月1日放假调休,共3天。4月27日(星期六)、4月28日(星期日)上班。", + "expected": [ + { "date": "2013-04-29", "isOffDay": true }, + { "date": "2013-04-30", "isOffDay": true }, + { "date": "2013-05-01", "isOffDay": true }, + { "date": "2013-04-27", "isOffDay": false }, + { "date": "2013-04-28", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": 
"6月10日至12日放假调休,共3天。6月8日(星期六)、6月9日(星期日)上班。", + "expected": [ + { "date": "2013-06-10", "isOffDay": true }, + { "date": "2013-06-11", "isOffDay": true }, + { "date": "2013-06-12", "isOffDay": true }, + { "date": "2013-06-08", "isOffDay": false }, + { "date": "2013-06-09", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": "9月19日至21日放假调休,共3天。9月22日(星期日)上班。", + "expected": [ + { "date": "2013-09-19", "isOffDay": true }, + { "date": "2013-09-20", "isOffDay": true }, + { "date": "2013-09-21", "isOffDay": true }, + { "date": "2013-09-22", "isOffDay": false } + ] + }, + { + "year": 2013, + "description": "10月1日至7日放假调休,共7天。9月29日(星期日)、10月12日(星期六)上班。", + "expected": [ + { "date": "2013-10-01", "isOffDay": true }, + { "date": "2013-10-02", "isOffDay": true }, + { "date": "2013-10-03", "isOffDay": true }, + { "date": "2013-10-04", "isOffDay": true }, + { "date": "2013-10-05", "isOffDay": true }, + { "date": "2013-10-06", "isOffDay": true }, + { "date": "2013-10-07", "isOffDay": true }, + { "date": "2013-09-29", "isOffDay": false }, + { "date": "2013-10-12", "isOffDay": false } + ] + }, + { + "year": 2008, + "description": "2007年12月30日—2008年1月1日放假,共3天。其中,1月1日(星期二)为法定节假日,12月30日(星期日)为公休日,12月29日(星期六)公休日调至12月31日(星期一),12月29日(星期六)上班。", + "expected": [ + { "date": "2007-12-30", "isOffDay": true }, + { "date": "2007-12-31", "isOffDay": true }, + { "date": "2008-01-01", "isOffDay": true }, + { "date": "2007-12-29", "isOffDay": false } + ] + }, + { + "year": 2008, + "description": "2月6日—12日(农历除夕至正月初六)放假,共7天。其中,2月6日(除夕)、2月7日(春节)、2月8日(正月初二)为法定节假日,2月9日(星期六)、2月10日(星期日)照常公休,2月2日(星期六)、2月3日(星期日)两个公休日调至2月11日(星期一)、2月12日(星期二),2月2日(星期六)、2月3日(星期日)上班。", + "expected": [ + { "date": "2008-02-06", "isOffDay": true }, + { "date": "2008-02-07", "isOffDay": true }, + { "date": "2008-02-08", "isOffDay": true }, + { "date": "2008-02-09", "isOffDay": true }, + { "date": "2008-02-10", "isOffDay": true }, + { "date": "2008-02-11", "isOffDay": true }, + { "date": "2008-02-12", "isOffDay": true }, + 
{ "date": "2008-02-02", "isOffDay": false }, + { "date": "2008-02-03", "isOffDay": false } + ] + }, + { + "year": 2008, + "description": "4月4日—6日放假,共3天。其中,4月4日(清明节)为法定节假日,4月5日(星期六)、4月6日(星期日)照常公休。", + "expected": [ + { "date": "2008-04-04", "isOffDay": true }, + { "date": "2008-04-05", "isOffDay": true }, + { "date": "2008-04-06", "isOffDay": true } + ] + }, + { + "year": 2008, + "description": "5月1日—3日放假,共3天。其中,5月1日为法定节假日,5月3日(星期六)为公休日,5月4日(星期日)公休日调至5月2日(星期五),5月4日(星期日)上班。", + "expected": [ + { "date": "2008-05-01", "isOffDay": true }, + { "date": "2008-05-02", "isOffDay": true }, + { "date": "2008-05-03", "isOffDay": true }, + { "date": "2008-05-04", "isOffDay": false } + ] + }, + { + "year": 2008, + "description": "6月7日—9日放假,共3天。其中,6月7日(星期六)照常公休,6月8日(农历五月初五,端午节)为法定节假日,6月8日(星期日)公休日调至6月9日(星期一)。", + "expected": [ + { "date": "2008-06-07", "isOffDay": true }, + { "date": "2008-06-08", "isOffDay": true }, + { "date": "2008-06-09", "isOffDay": true } + ] + }, + { + "year": 2008, + "description": "9月13日—15日放假,共3天。其中,9月13日(星期六)为公休日,9月14日(农历八月十五,中秋节)为法定节假日,9月14日(星期日)公休日调至9月15日(星期一)。", + "expected": [ + { "date": "2008-09-13", "isOffDay": true }, + { "date": "2008-09-14", "isOffDay": true }, + { "date": "2008-09-15", "isOffDay": true } + ] + }, + { + "year": 2008, + "description": "9月13日—15日放假,共3天。其中,9月13日(星期六)为公休日,9月14日(农历八月十五,中秋节)为法定节假日,9月14日(星期日)公休日调至9月15日(星期一)。", + "expected": [ + { "date": "2008-09-13", "isOffDay": true }, + { "date": "2008-09-14", "isOffDay": true }, + { "date": "2008-09-15", "isOffDay": true } + ] + }, + { + "year": 2008, + "description": "9月29日—10月5日放假,共7天。其中,10月1日、2日、3日为法定节假日,9月27日(星期六)、9月28日(星期日)两个公休日调至9月29日(星期一)、30日(星期二),10月4日(星期六)、5日(星期日)照常公休。", + "expected": [ + { "date": "2008-09-29", "isOffDay": true }, + { "date": "2008-09-30", "isOffDay": true }, + { "date": "2008-10-01", "isOffDay": true }, + { "date": "2008-10-02", "isOffDay": true }, + { "date": "2008-10-03", "isOffDay": true }, + { "date": "2008-10-04", "isOffDay": true }, + { "date": 
"2008-10-05", "isOffDay": true }, + { "date": "2008-09-27", "isOffDay": false }, + { "date": "2008-09-28", "isOffDay": false } + ] + } +] diff --git a/fetch_holidays.py b/fetch_holidays.py new file mode 100644 index 0000000..61434bf --- /dev/null +++ b/fetch_holidays.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +"""Fetch holidays from gov.cn """ + +import argparse +import re + +import bs4 +import requests + +SEARCH_URL = ('http://sousuo.gov.cn/s.htm' + '?t=paper&advance=true&sort=&title={year}+%E8%8A%82%E5%81%87%E6%97%A5' + '&puborg=%E5%9B%BD%E5%8A%A1%E9%99%A2%E5%8A%9E%E5%85%AC%E5%8E%85' + '&pcodeJiguan=%E5%9B%BD%E5%8A%9E%E5%8F%91%E6%98%8E%E7%94%B5') + + +def get_paper_urls(year): + url = SEARCH_URL.format(year=year) + body = requests.get(url).text + ret = re.findall( + r'
  • ', body, flags=re.S) + assert all( + re.match( + r'http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm', i) + for i in ret), 'Site changed, need human verify' + + return ret + + +def get_paper(url): + response = requests.get(url) + response.encoding = 'utf-8' + soup = bs4.BeautifulSoup(response.text, features='html.parser') + container = soup.find('td', class_='b12c') + assert container, f'Can not get paper container from url: {url}' + ret = container.get_text() + assert ret, f'Can not get paper context from url: {url}' + return ret + + +def get_rules(paper: str): + lines: list = paper.splitlines() + for i in sorted(set(lines), key=lines.index): + match = re.match(r'[一二三四五六七八九十]、(.+?):(.+)', i) + if match: + yield match.groups() + + +def parse_holiday_description(year, description): + pass + + +def parse_paper(url): + pass + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('year') + args = parser.parse_args() + + papers = get_paper_urls(args.year) + + for i in papers: + paper = get_paper(i) + [print(i) for i in get_rules(paper)] + + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..792285a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests ~= 2.21.0 +beautifulsoup4 ~= 4.7.1 \ No newline at end of file diff --git a/test_fetch_holidays.py b/test_fetch_holidays.py new file mode 100644 index 0000000..c2c20c4 --- /dev/null +++ b/test_fetch_holidays.py @@ -0,0 +1,24 @@ + +import json +import sys + +from fetch_holidays import parse_holiday_description + + +def _generate_tests(): + with open('description_parsing_cases.json', 'r', encoding='utf-8', ) as f: + cases = json.load(f) + + def create_test(case): + def _test(): + year, description, expected = case['year'], case['description'], case['expected'] + assert parse_holiday_description( + year, description) == expected, case + return _test + + for index, case in enumerate(cases, 1): + 
setattr(sys.modules[__name__], + f'test_parse_holiday_description_{index}', create_test(case)) + + +_generate_tests()