chore: move python code to /scripts

This commit is contained in:
NateScarlet
2022-12-09 16:52:02 +08:00
parent df637aa453
commit ef9f9cd5e3
11 changed files with 17 additions and 10 deletions

View File

@@ -0,0 +1,497 @@
[
{
"year": 2019,
"description": "2018年12月30日至2019年1月1日放假调休共3天。2018年12月29日星期六上班。",
"expected": [
{ "date": "2018-12-30", "isOffDay": true },
{ "date": "2018-12-31", "isOffDay": true },
{ "date": "2019-01-01", "isOffDay": true },
{ "date": "2018-12-29", "isOffDay": false }
]
},
{
"year": 2019,
"description": "2月4日至10日放假调休共7天。2月2日星期六、2月3日星期日上班。",
"expected": [
{ "date": "2019-02-04", "isOffDay": true },
{ "date": "2019-02-05", "isOffDay": true },
{ "date": "2019-02-06", "isOffDay": true },
{ "date": "2019-02-07", "isOffDay": true },
{ "date": "2019-02-08", "isOffDay": true },
{ "date": "2019-02-09", "isOffDay": true },
{ "date": "2019-02-10", "isOffDay": true },
{ "date": "2019-02-02", "isOffDay": false },
{ "date": "2019-02-03", "isOffDay": false }
]
},
{
"year": 2019,
"description": "4月5日放假与周末连休。",
"expected": [{ "date": "2019-04-05", "isOffDay": true }]
},
{
"year": 2019,
"description": "5月1日放假。",
"expected": [{ "date": "2019-05-01", "isOffDay": true }]
},
{
"year": 2019,
"description": "6月7日放假与周末连休。",
"expected": [{ "date": "2019-06-07", "isOffDay": true }]
},
{
"year": 2019,
"description": "9月13日放假与周末连休。",
"expected": [{ "date": "2019-09-13", "isOffDay": true }]
},
{
"year": 2019,
"description": "10月1日至7日放假调休共7天。9月29日星期日、10月12日星期六上班。",
"expected": [
{ "date": "2019-10-01", "isOffDay": true },
{ "date": "2019-10-02", "isOffDay": true },
{ "date": "2019-10-03", "isOffDay": true },
{ "date": "2019-10-04", "isOffDay": true },
{ "date": "2019-10-05", "isOffDay": true },
{ "date": "2019-10-06", "isOffDay": true },
{ "date": "2019-10-07", "isOffDay": true },
{ "date": "2019-09-29", "isOffDay": false },
{ "date": "2019-10-12", "isOffDay": false }
]
},
{
"year": 2018,
"description": "1月1日放假与周末连休。",
"expected": [{ "date": "2018-01-01", "isOffDay": true }]
},
{
"year": 2018,
"description": "2月15日至21日放假调休共7天。2月11日星期日、2月24日星期六上班。",
"expected": [
{ "date": "2018-02-15", "isOffDay": true },
{ "date": "2018-02-16", "isOffDay": true },
{ "date": "2018-02-17", "isOffDay": true },
{ "date": "2018-02-18", "isOffDay": true },
{ "date": "2018-02-19", "isOffDay": true },
{ "date": "2018-02-20", "isOffDay": true },
{ "date": "2018-02-21", "isOffDay": true },
{ "date": "2018-02-11", "isOffDay": false },
{ "date": "2018-02-24", "isOffDay": false }
]
},
{
"year": 2018,
"description": "4月5日至7日放假调休共3天。4月8日星期日上班。",
"expected": [
{ "date": "2018-04-05", "isOffDay": true },
{ "date": "2018-04-06", "isOffDay": true },
{ "date": "2018-04-07", "isOffDay": true },
{ "date": "2018-04-08", "isOffDay": false }
]
},
{
"year": 2018,
"description": "4月29日至5月1日放假调休共3天。4月28日星期六上班。",
"expected": [
{ "date": "2018-04-29", "isOffDay": true },
{ "date": "2018-04-30", "isOffDay": true },
{ "date": "2018-05-01", "isOffDay": true },
{ "date": "2018-04-28", "isOffDay": false }
]
},
{
"year": 2018,
"description": "6月18日放假与周末连休。",
"expected": [{ "date": "2018-06-18", "isOffDay": true }]
},
{
"year": 2018,
"description": "9月24日放假与周末连休。",
"expected": [{ "date": "2018-09-24", "isOffDay": true }]
},
{
"year": 2018,
"description": "10月1日至7日放假调休共7天。9月29日星期六、9月30日星期日上班。",
"expected": [
{ "date": "2018-10-01", "isOffDay": true },
{ "date": "2018-10-02", "isOffDay": true },
{ "date": "2018-10-03", "isOffDay": true },
{ "date": "2018-10-04", "isOffDay": true },
{ "date": "2018-10-05", "isOffDay": true },
{ "date": "2018-10-06", "isOffDay": true },
{ "date": "2018-10-07", "isOffDay": true },
{ "date": "2018-09-29", "isOffDay": false },
{ "date": "2018-09-30", "isOffDay": false }
]
},
{
"year": 2016,
"description": "1月1日放假与周末连休。",
"expected": [{ "date": "2016-01-01", "isOffDay": true }]
},
{
"year": 2016,
"description": "2月7日至13日放假调休共7天。2月6日星期六、2月14日星期日上班。",
"expected": [
{ "date": "2016-02-07", "isOffDay": true },
{ "date": "2016-02-08", "isOffDay": true },
{ "date": "2016-02-09", "isOffDay": true },
{ "date": "2016-02-10", "isOffDay": true },
{ "date": "2016-02-11", "isOffDay": true },
{ "date": "2016-02-12", "isOffDay": true },
{ "date": "2016-02-13", "isOffDay": true },
{ "date": "2016-02-06", "isOffDay": false },
{ "date": "2016-02-14", "isOffDay": false }
]
},
{
"year": 2016,
"description": "4月4日放假与周末连休。",
"expected": [{ "date": "2016-04-04", "isOffDay": true }]
},
{
"year": 2016,
"description": "5月1日放假5月2日星期一补休。",
"expected": [
{ "date": "2016-05-01", "isOffDay": true },
{ "date": "2016-05-02", "isOffDay": true }
]
},
{
"year": 2016,
"description": "6月9日至11日放假调休共3天。6月12日星期日上班。",
"expected": [
{ "date": "2016-06-09", "isOffDay": true },
{ "date": "2016-06-10", "isOffDay": true },
{ "date": "2016-06-11", "isOffDay": true },
{ "date": "2016-06-12", "isOffDay": false }
]
},
{
"year": 2016,
"description": "9月15日至17日放假调休共3天。9月18日星期日上班。",
"expected": [
{ "date": "2016-09-15", "isOffDay": true },
{ "date": "2016-09-16", "isOffDay": true },
{ "date": "2016-09-17", "isOffDay": true },
{ "date": "2016-09-18", "isOffDay": false }
]
},
{
"year": 2016,
"description": "10月1日至7日放假调休共7天。10月8日星期六、10月9日星期日上班。",
"expected": [
{ "date": "2016-10-01", "isOffDay": true },
{ "date": "2016-10-02", "isOffDay": true },
{ "date": "2016-10-03", "isOffDay": true },
{ "date": "2016-10-04", "isOffDay": true },
{ "date": "2016-10-05", "isOffDay": true },
{ "date": "2016-10-06", "isOffDay": true },
{ "date": "2016-10-07", "isOffDay": true },
{ "date": "2016-10-08", "isOffDay": false },
{ "date": "2016-10-09", "isOffDay": false }
]
},
{
"year": 2016,
"description": "5月1日放假5月2日星期一补休。",
"expected": [
{ "date": "2016-05-01", "isOffDay": true },
{ "date": "2016-05-02", "isOffDay": true }
]
},
{
"year": 2014,
"description": "1月1日放假1天。",
"expected": [{ "date": "2014-01-01", "isOffDay": true }]
},
{
"year": 2014,
"description": "1月31日至2月6日放假调休共7天。1月26日星期日、2月8日星期六上班。",
"expected": [
{ "date": "2014-01-31", "isOffDay": true },
{ "date": "2014-02-01", "isOffDay": true },
{ "date": "2014-02-02", "isOffDay": true },
{ "date": "2014-02-03", "isOffDay": true },
{ "date": "2014-02-04", "isOffDay": true },
{ "date": "2014-02-05", "isOffDay": true },
{ "date": "2014-02-06", "isOffDay": true },
{ "date": "2014-01-26", "isOffDay": false },
{ "date": "2014-02-08", "isOffDay": false }
]
},
{
"year": 2014,
"description": "4月5日放假4月7日星期一补休。",
"expected": [
{ "date": "2014-04-05", "isOffDay": true },
{ "date": "2014-04-07", "isOffDay": true }
]
},
{
"year": 2014,
"description": "5月1日至3日放假调休共3天。5月4日星期日上班。",
"expected": [
{ "date": "2014-05-01", "isOffDay": true },
{ "date": "2014-05-02", "isOffDay": true },
{ "date": "2014-05-03", "isOffDay": true },
{ "date": "2014-05-04", "isOffDay": false }
]
},
{
"year": 2014,
"description": "6月2日放假与周末连休。",
"expected": [{ "date": "2014-06-02", "isOffDay": true }]
},
{
"year": 2014,
"description": "9月8日放假与周末连休。",
"expected": [{ "date": "2014-09-08", "isOffDay": true }]
},
{
"year": 2014,
"description": "10月1日至7日放假调休共7天。9月28日星期日、10月11日星期六上班。",
"expected": [
{ "date": "2014-10-01", "isOffDay": true },
{ "date": "2014-10-02", "isOffDay": true },
{ "date": "2014-10-03", "isOffDay": true },
{ "date": "2014-10-04", "isOffDay": true },
{ "date": "2014-10-05", "isOffDay": true },
{ "date": "2014-10-06", "isOffDay": true },
{ "date": "2014-10-07", "isOffDay": true },
{ "date": "2014-09-28", "isOffDay": false },
{ "date": "2014-10-11", "isOffDay": false }
]
},
{
"year": 2013,
"description": "1月1日至3日放假调休共3天。1月5日(星期六)、1月6日(星期日)上班。",
"expected": [
{ "date": "2013-01-01", "isOffDay": true },
{ "date": "2013-01-02", "isOffDay": true },
{ "date": "2013-01-03", "isOffDay": true },
{ "date": "2013-01-05", "isOffDay": false },
{ "date": "2013-01-06", "isOffDay": false }
]
},
{
"year": 2013,
"description": "2月9日至15日放假调休共7天。2月16日(星期六)、2月17日(星期日)上班。",
"expected": [
{ "date": "2013-02-09", "isOffDay": true },
{ "date": "2013-02-10", "isOffDay": true },
{ "date": "2013-02-11", "isOffDay": true },
{ "date": "2013-02-12", "isOffDay": true },
{ "date": "2013-02-13", "isOffDay": true },
{ "date": "2013-02-14", "isOffDay": true },
{ "date": "2013-02-15", "isOffDay": true },
{ "date": "2013-02-16", "isOffDay": false },
{ "date": "2013-02-17", "isOffDay": false }
]
},
{
"year": 2013,
"description": "4月4日至6日放假调休共3天。4月7日(星期日)上班。",
"expected": [
{ "date": "2013-04-04", "isOffDay": true },
{ "date": "2013-04-05", "isOffDay": true },
{ "date": "2013-04-06", "isOffDay": true },
{ "date": "2013-04-07", "isOffDay": false }
]
},
{
"year": 2013,
"description": "4月29日至5月1日放假调休共3天。4月27日(星期六)、4月28日(星期日)上班。",
"expected": [
{ "date": "2013-04-29", "isOffDay": true },
{ "date": "2013-04-30", "isOffDay": true },
{ "date": "2013-05-01", "isOffDay": true },
{ "date": "2013-04-27", "isOffDay": false },
{ "date": "2013-04-28", "isOffDay": false }
]
},
{
"year": 2013,
"description": "6月10日至12日放假调休共3天。6月8日(星期六)、6月9日(星期日)上班。",
"expected": [
{ "date": "2013-06-10", "isOffDay": true },
{ "date": "2013-06-11", "isOffDay": true },
{ "date": "2013-06-12", "isOffDay": true },
{ "date": "2013-06-08", "isOffDay": false },
{ "date": "2013-06-09", "isOffDay": false }
]
},
{
"year": 2013,
"description": "9月19日至21日放假调休共3天。9月22日(星期日)上班。",
"expected": [
{ "date": "2013-09-19", "isOffDay": true },
{ "date": "2013-09-20", "isOffDay": true },
{ "date": "2013-09-21", "isOffDay": true },
{ "date": "2013-09-22", "isOffDay": false }
]
},
{
"year": 2013,
"description": "10月1日至7日放假调休共7天。9月29日(星期日)、10月12日(星期六)上班。",
"expected": [
{ "date": "2013-10-01", "isOffDay": true },
{ "date": "2013-10-02", "isOffDay": true },
{ "date": "2013-10-03", "isOffDay": true },
{ "date": "2013-10-04", "isOffDay": true },
{ "date": "2013-10-05", "isOffDay": true },
{ "date": "2013-10-06", "isOffDay": true },
{ "date": "2013-10-07", "isOffDay": true },
{ "date": "2013-09-29", "isOffDay": false },
{ "date": "2013-10-12", "isOffDay": false }
]
},
{
"year": 2011,
"description": "2月2日农历除夕至8日放假调休共7天。1月30日星期日、2月12日星期六上班。",
"expected": [
{ "date": "2011-02-02", "isOffDay": true },
{ "date": "2011-02-03", "isOffDay": true },
{ "date": "2011-02-04", "isOffDay": true },
{ "date": "2011-02-05", "isOffDay": true },
{ "date": "2011-02-06", "isOffDay": true },
{ "date": "2011-02-07", "isOffDay": true },
{ "date": "2011-02-08", "isOffDay": true },
{ "date": "2011-01-30", "isOffDay": false },
{ "date": "2011-02-12", "isOffDay": false }
]
},
{
"year": 2008,
"description": "2007年12月30日—2008年1月1日放假共3天。其中1月1日星期二为法定节假日12月30日星期日为公休日12月29日星期六公休日调至12月31日星期一12月29日星期六上班。",
"expected": [
{ "date": "2007-12-30", "isOffDay": true },
{ "date": "2007-12-31", "isOffDay": true },
{ "date": "2008-01-01", "isOffDay": true },
{ "date": "2007-12-29", "isOffDay": false }
]
},
{
"year": 2008,
"description": "2月6日—12日农历除夕至正月初六放假共7天。其中2月6日除夕、2月7日春节、2月8日正月初二为法定节假日2月9日星期六、2月10日星期日照常公休2月2日星期六、2月3日星期日两个公休日调至2月11日星期一、2月12日星期二2月2日星期六、2月3日星期日上班。",
"expected": [
{ "date": "2008-02-06", "isOffDay": true },
{ "date": "2008-02-07", "isOffDay": true },
{ "date": "2008-02-08", "isOffDay": true },
{ "date": "2008-02-09", "isOffDay": true },
{ "date": "2008-02-10", "isOffDay": true },
{ "date": "2008-02-11", "isOffDay": true },
{ "date": "2008-02-12", "isOffDay": true },
{ "date": "2008-02-02", "isOffDay": false },
{ "date": "2008-02-03", "isOffDay": false }
]
},
{
"year": 2008,
"description": "4月4日—6日放假共3天。其中4月4日清明节为法定节假日4月5日星期六、4月6日星期日照常公休。",
"expected": [
{ "date": "2008-04-04", "isOffDay": true },
{ "date": "2008-04-05", "isOffDay": true },
{ "date": "2008-04-06", "isOffDay": true }
]
},
{
"year": 2008,
"description": "5月1日—3日放假共3天。其中5月1日为法定节假日5月3日星期六为公休日5月4日星期日公休日调至5月2日星期五5月4日星期日上班。",
"expected": [
{ "date": "2008-05-01", "isOffDay": true },
{ "date": "2008-05-02", "isOffDay": true },
{ "date": "2008-05-03", "isOffDay": true },
{ "date": "2008-05-04", "isOffDay": false }
]
},
{
"year": 2008,
"description": "6月7日—9日放假共3天。其中6月7日星期六照常公休6月8日农历五月初五端午节为法定节假日6月8日星期日公休日调至6月9日星期一。",
"expected": [
{ "date": "2008-06-07", "isOffDay": true },
{ "date": "2008-06-08", "isOffDay": true },
{ "date": "2008-06-09", "isOffDay": true }
]
},
{
"year": 2008,
"description": "9月13日—15日放假共3天。其中9月13日星期六为公休日9月14日农历八月十五中秋节为法定节假日9月14日星期日公休日调至9月15日星期一。",
"expected": [
{ "date": "2008-09-13", "isOffDay": true },
{ "date": "2008-09-14", "isOffDay": true },
{ "date": "2008-09-15", "isOffDay": true }
]
},
{
"year": 2008,
"description": "9月29日—10月5日放假共7天。其中10月1日、2日、3日为法定节假日9月27日星期六、9月28日星期日两个公休日调至9月29日星期一、30日星期二10月4日星期六、5日星期日照常公休。",
"expected": [
{ "date": "2008-09-29", "isOffDay": true },
{ "date": "2008-09-30", "isOffDay": true },
{ "date": "2008-10-01", "isOffDay": true },
{ "date": "2008-10-02", "isOffDay": true },
{ "date": "2008-10-03", "isOffDay": true },
{ "date": "2008-10-04", "isOffDay": true },
{ "date": "2008-10-05", "isOffDay": true },
{ "date": "2008-09-27", "isOffDay": false },
{ "date": "2008-09-28", "isOffDay": false }
]
},
{
"year": 2007,
"description": "1月1日—3日放假共3天。其中1月1日为法定假日将2006年12月30日星期六)、31日星期日两个公休日分别调至2007年1月2日、3日2006年12月30日星期六)、31日星期日上班。",
"expected": [
{ "date": "2007-01-01", "isOffDay": true },
{ "date": "2007-01-02", "isOffDay": true },
{ "date": "2007-01-03", "isOffDay": true },
{ "date": "2006-12-30", "isOffDay": false },
{ "date": "2006-12-31", "isOffDay": false }
]
},
{
"year": 2007,
"description": "2月18日—24日即农历大年初一至初七放假共7天。其中18日、19日、20日为法定假日将17日星期六)、18日星期日)、25日星期日三个公休日分别调至21日星期三)、22日星期四)、23日星期五)24日星期六照常公休17日、25日上班。",
"expected": [
{ "date": "2007-02-18", "isOffDay": true },
{ "date": "2007-02-19", "isOffDay": true },
{ "date": "2007-02-20", "isOffDay": true },
{ "date": "2007-02-21", "isOffDay": true },
{ "date": "2007-02-22", "isOffDay": true },
{ "date": "2007-02-23", "isOffDay": true },
{ "date": "2007-02-24", "isOffDay": true },
{ "date": "2007-02-17", "isOffDay": false },
{ "date": "2007-02-25", "isOffDay": false }
]
},
{
"year": 2007,
"description": "5月1日—7日放假共7天。其中1日、2日、3日为法定假日将4月28日星期六)、29日星期日两个公休日调至5月4日星期五)、7日星期一)5月5日星期六)、6日星期日照常公休4月28日、29日上班。",
"expected": [
{ "date": "2007-05-01", "isOffDay": true },
{ "date": "2007-05-02", "isOffDay": true },
{ "date": "2007-05-03", "isOffDay": true },
{ "date": "2007-05-04", "isOffDay": true },
{ "date": "2007-05-05", "isOffDay": true },
{ "date": "2007-05-06", "isOffDay": true },
{ "date": "2007-05-07", "isOffDay": true },
{ "date": "2007-04-28", "isOffDay": false },
{ "date": "2007-04-29", "isOffDay": false }
]
},
{
"year": 2007,
"description": "10月1日—7日放假共7天。其中1日、2日、3日为法定假日将9月29日星期六)、30日星期日两个公休日调至10月4日星期四)、5日星期五)10月6日星期六)、7日星期日照常公休9月29日、30日上班。",
"expected": [
{ "date": "2007-10-01", "isOffDay": true },
{ "date": "2007-10-02", "isOffDay": true },
{ "date": "2007-10-03", "isOffDay": true },
{ "date": "2007-10-04", "isOffDay": true },
{ "date": "2007-10-05", "isOffDay": true },
{ "date": "2007-10-06", "isOffDay": true },
{ "date": "2007-10-07", "isOffDay": true },
{ "date": "2007-09-29", "isOffDay": false },
{ "date": "2007-09-30", "isOffDay": false }
]
}
]

436
scripts/fetch.py Normal file
View File

@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""Fetch holidays from gov.cn """
import argparse
import json
import re
from datetime import date, timedelta
from itertools import chain
from typing import Iterator, List, Optional, Tuple
import bs4
import requests
# Search endpoint queried by get_paper_urls to find the papers of a year.
SEARCH_URL = "http://sousuo.gov.cn/s.htm"
# Search results that get_paper_urls removes from its result list.
PAPER_EXCLUDE = [
"http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm",
"http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm",
]
# Extra paper urls, keyed by year, that get_paper_urls appends to the search
# results for that year.
PAPER_INCLUDE = {
2015: ["http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm"]
}
# Hand-written parse results keyed by paper url. parse_paper yields these
# entries as-is instead of downloading and parsing the paper.
PRE_PARSED_PAPERS = {
"http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm": [
{
"name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
"date": date(2015, 9, 3),
"isOffDay": True,
},
{
"name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
"date": date(2015, 9, 4),
"isOffDay": True,
},
{
"name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
"date": date(2015, 9, 5),
"isOffDay": True,
},
{
"name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
"date": date(2015, 9, 6),
"isOffDay": False,
},
],
"http://www.gov.cn/zhengce/content/2020-01/27/content_5472352.htm": [
{
"name": "春节",
"date": date(2020, 1, 31),
"isOffDay": True,
},
{
"name": "春节",
"date": date(2020, 2, 1),
"isOffDay": True,
},
{
"name": "春节",
"date": date(2020, 2, 2),
"isOffDay": True,
},
{
"name": "春节",
"date": date(2020, 2, 3),
"isOffDay": False,
},
],
}
def _raise_for_status_200(resp: requests.Response):
    """Raise unless the response status is exactly 200.

    Args:
        resp (requests.Response): Response to check.

    Raises:
        requests.HTTPError: Either from ``resp.raise_for_status()`` or,
            for any remaining status other than 200, from this function.
    """
    resp.raise_for_status()
    if resp.status_code == 200:
        return
    # raise_for_status let this status through, but only 200 is accepted here.
    message = "request failed: %d: %s" % (resp.status_code, resp.request.url)
    raise requests.HTTPError(message, response=resp)
def get_paper_urls(year: int, timeout: float = 30.0) -> List[str]:
    """Find year related paper urls.

    Args:
        year (int): eg. 2018
        timeout (float): Seconds to wait for the search server before the
            request is aborted. Without it a stalled connection would block
            forever.

    Returns:
        List[str]: Urls in ascending string order. The publish date is part
            of the url path, so older papers come first.

    Raises:
        requests.RequestException: When the request fails, times out or is
            not answered with status 200.
        RuntimeError: When nothing is found for a year that is not in the
            future.
    """
    resp = requests.get(
        SEARCH_URL,
        params={
            "t": "paper",
            "advance": "true",
            "title": year,
            "q": "假期",
            "pcodeJiguan": "国办发明电",
            "puborg": "国务院办公厅",
        },
        timeout=timeout,
    )
    _raise_for_status_200(resp)
    found = re.findall(
        r'<li class="res-list".*?<a href="(.+?)".*?</li>', resp.text, flags=re.S
    )
    urls = [i for i in found if i not in PAPER_EXCLUDE]
    urls += PAPER_INCLUDE.get(year, [])
    urls.sort()
    # Papers of a future year may simply not be published yet, so an empty
    # result is only an error for the current and past years.
    if not urls and date.today().year >= year:
        raise RuntimeError("could not found papers for %d" % year)
    return urls
def get_paper(url: str, timeout: float = 30.0) -> str:
    """Extract paper text from url.

    Args:
        url (str): Paper url.
        timeout (float): Seconds to wait for the server before the request
            is aborted. Without it a stalled connection would block forever.

    Returns:
        str: Extracted paper text, one paragraph per line.

    Raises:
        AssertionError: When the url or the page layout is not the expected
            one, or the page has no text.
        requests.RequestException: When the request fails, times out or is
            not answered with status 200.
    """
    # Dots are escaped so that only the literal host name and extension pass.
    assert re.match(
        r"http://www\.gov\.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+\.htm",
        url,
    ), "Site changed, need human verify"

    response = requests.get(url, timeout=timeout)
    _raise_for_status_200(response)
    response.encoding = "utf-8"
    soup = bs4.BeautifulSoup(response.text, features="html.parser")
    container = soup.find("td", class_="b12c")
    assert container, f"Can not get paper container from url: {url}"
    # Two ideographic spaces are turned into line breaks so that the text
    # can be processed line by line.
    ret = container.get_text().replace("\u3000\u3000", "\n")
    assert ret, f"Can not get paper content from url: {url}"
    return ret
def get_rules(paper: str) -> Iterator[Tuple[str, str]]:
    """Extract rules from paper.

    Duplicated lines are dropped (first occurrence wins), then the normal
    rules are yielded, followed by the patch rules.

    Args:
        paper (str): Paper text

    Raises:
        NotImplementedError: When find no rules.

    Returns:
        Iterator[Tuple[str, str]]: (name, description)
    """
    # dict keys keep insertion order, which removes duplicates in one pass
    # while preserving the order of first appearance.
    lines: list = list(dict.fromkeys(paper.splitlines()))
    found = False
    for rule in chain(get_normal_rules(lines), get_patch_rules(lines)):
        found = True
        yield rule
    if not found:
        raise NotImplementedError(lines)
def get_normal_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]:
    """Get normal holiday rule for a year

    A rule line starts with a Chinese numeral and an enumeration comma,
    followed by the holiday name, a fullwidth colon and the description.

    Args:
        lines (Iterator[str]): paper content

    Returns:
        Iterator[Tuple[str, str]]: (name, description)
    """
    # \uff1a is the fullwidth colon. Without a separator between the two
    # groups the lazy name group would always capture a single character.
    pattern = re.compile(r"[一二三四五六七八九十]、(.+?)\uff1a(.+)")
    for line in lines:
        match = pattern.match(line)
        if match:
            yield match.groups()
def get_patch_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]:
    """Get holiday patch rule for existed holiday

    The holiday name is taken from the most recent line that looks like an
    arrangement title; every following numbered item that mentions a
    month/day date is reported under that name.

    Args:
        lines (Iterator[str]): paper content

    Returns:
        Iterator[Tuple[str, str]]: (name, description)
    """
    holiday = None
    for line in lines:
        title = re.match(r".*\d+年([^和、]{2,})(?:假期|放假).*安排", line)
        if title:
            holiday = title.group(1)
        if not holiday:
            # No title seen so far, nothing to attach an item to.
            continue
        item = re.match(r"^[一二三四五六七八九十]、(.+)$", line)
        if item and re.match(r".*\d+月\d+日.*", item.group(1)):
            yield holiday, item.group(1)
def _cast_int(value):
    """Convert a truthy value with ``int``; any falsy value becomes None."""
    if not value:
        return None
    return int(value)
class DescriptionParser:
    """Parser for holiday shift description.

    ``date_history`` records every date produced while parsing, in order.
    It is the context used to fill in a month or year that a later date
    leaves out.
    """

    def __init__(self, description: str, year: int):
        self.description = description
        self.year = year
        self.date_history = []

    def parse(self) -> Iterator[dict]:
        """Generator for description parsing result.

        The description is split into sentences and each sentence is handed
        to a ``SentenceParser``.

        Raises:
            NotImplementedError: When no date was found in the description.
        """
        self.date_history.clear()
        for sentence in re.split("[,。;]", self.description):
            yield from SentenceParser(self, sentence).parse()

        if not self.date_history:
            raise NotImplementedError(self.description)

    def get_date(self, year: Optional[int], month: Optional[int], day: int) -> date:
        """Get date in context.

        Args:
            year (Optional[int]): year, defaults to the context year
            month (Optional[int]): month, defaults to the month of the most
                recently recorded date
            day (int): day

        Returns:
            date: Date result
        """
        assert day, "No day specified"

        history = self.date_history
        # Special case: month inherit
        if month is None:
            month = history[-1].month

        # Special case: 12 month may mean previous year
        if year is None and month == 12 and history:
            if max(history) < date(year=self.year, month=2, day=1):
                year = self.year - 1

        return date(year=year or self.year, month=month, day=day)
class SentenceParser:
    """Parser for holiday shift description sentence.

    The sentence is offered to every method in ``parsing_methods``. A method
    that recognises the sentence extracts its dates with ``extract_dates``
    and labels each one as off day or work day.
    """

    def __init__(self, parent: "DescriptionParser", sentence):
        self.parent = parent
        self.sentence = sentence

    def extract_dates(self, text: str) -> Iterator[date]:
        """Extract date from text.

        Every extracted date is appended to the parent's ``date_history``,
        but a date that was already recorded there is not yielded again.

        Args:
            text (str): Text to extract

        Raises:
            NotImplementedError: When no extraction method finds a date.

        Returns:
            Iterator[date]: Extracted dates.
        """
        count = 0
        # Normalise ASCII parentheses to fullwidth ones (\uff08, \uff09) so
        # that the extraction methods only deal with a single style.
        text = text.replace("(", "\uff08").replace(")", "\uff09")
        for i in chain(
            *(method(self, text) for method in self.date_extraction_methods)
        ):
            count += 1
            is_seen = i in self.parent.date_history
            # Recorded even when seen: the last entry is the month context
            # for the next date.
            self.parent.date_history.append(i)
            if is_seen:
                continue
            yield i

        if not count:
            raise NotImplementedError(text)

    @staticmethod
    def _strip_notes(value: str) -> str:
        """Remove parenthesised notes such as a weekday after a date."""
        return re.sub(r"\uff08.+?\uff09", "", value)

    def _extract_dates_1(self, value: str) -> Iterator[date]:
        """Yield every single ``[year][month]day`` date mentioned in value."""
        match = re.findall(r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日", value)
        for groups in match:
            groups = [_cast_int(i) for i in groups]
            assert len(groups) == 3, groups
            yield self.parent.get_date(year=groups[0], month=groups[1], day=groups[2])

    def _extract_dates_2(self, value: str) -> Iterator[date]:
        """Yield every day of each ``start - end`` range, both ends included."""
        value = self._strip_notes(value)
        match = re.findall(
            r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:至|-|—)(?:(\d+)年)?(?:(\d+)月)?(\d+)日", value
        )
        for groups in match:
            groups = [_cast_int(i) for i in groups]
            assert len(groups) == 6, groups
            start = self.parent.get_date(year=groups[0], month=groups[1], day=groups[2])
            end = self.parent.get_date(year=groups[3], month=groups[4], day=groups[5])
            for i in range((end - start).days + 1):
                yield start + timedelta(days=i)

    def _extract_dates_3(self, value: str) -> Iterator[date]:
        """Yield the first and the last date of each enumeration of dates.

        A repeated group only keeps its last repetition, so dates in the
        middle of a longer enumeration are not produced here.
        """
        value = self._strip_notes(value)
        match = re.findall(
            r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:\uff08[^\uff09]+\uff09)?"
            r"(?:、(?:(\d+)年)?(?:(\d+)月)?(\d+)日(?:\uff08[^\uff09]+\uff09)?)+",
            value,
        )
        for groups in match:
            groups = [_cast_int(i) for i in groups]
            assert not (len(groups) % 3), groups
            for i in range(0, len(groups), 3):
                yield self.parent.get_date(
                    year=groups[i], month=groups[i + 1], day=groups[i + 2]
                )

    date_extraction_methods = [_extract_dates_1, _extract_dates_2, _extract_dates_3]

    def parse(self) -> Iterator[dict]:
        """Parse days of the sentence.

        Returns:
            Iterator[dict]: Days without name field.
        """
        for method in self.parsing_methods:
            yield from method(self)

    def _parse_rest_1(self):
        """Dates of a sentence ending with a rest keyword are off days."""
        match = re.match(r"(.+)(放假|补休|调休|公休)+(?:\d+天)?$", self.sentence)
        if match:
            for i in self.extract_dates(match.group(1)):
                yield {"date": i, "isOffDay": True}

    def _parse_work_1(self):
        """Dates of a sentence ending with the work keyword are work days."""
        match = re.match("(.+)上班$", self.sentence)
        if match:
            for i in self.extract_dates(match.group(1)):
                yield {"date": i, "isOffDay": False}

    def _parse_shift_1(self):
        """Dates before the shift keyword are work days, dates after it off days."""
        match = re.match("(.+)调至(.+)", self.sentence)
        if match:
            for i in self.extract_dates(match.group(1)):
                yield {"date": i, "isOffDay": False}
            for i in self.extract_dates(match.group(2)):
                yield {"date": i, "isOffDay": True}

    parsing_methods = [
        _parse_rest_1,
        _parse_work_1,
        _parse_shift_1,
    ]
def parse_paper(year: int, url: str) -> Iterator[dict]:
    """Parse one paper

    A url listed in ``PRE_PARSED_PAPERS`` is answered from that table
    without any download.

    Args:
        year (int): Year
        url (str): Paper url

    Raises:
        RuntimeError: When the paper text can not be parsed.

    Returns:
        Iterator[dict]: Days
    """
    if url in PRE_PARSED_PAPERS:
        for day in PRE_PARSED_PAPERS[url]:
            yield day
        return

    text = get_paper(url)
    try:
        for name, description in get_rules(text):
            for day in DescriptionParser(description, year).parse():
                yield {"name": name, **day}
    except NotImplementedError as ex:
        raise RuntimeError("Can not parse paper", url) from ex
def fetch_holiday(year: int):
    """Fetch holiday data.

    Papers are processed in the order returned by ``get_paper_urls``; when
    two papers mention the same date the later paper wins.

    Returns:
        dict: ``year``, ``papers`` (the urls used) and ``days`` sorted by date.
    """
    papers = get_paper_urls(year)

    by_date = {}
    for url in papers:
        for day in parse_paper(year, url):
            by_date[day["date"]] = day

    ordered = sorted(by_date.values(), key=lambda x: x["date"])
    return {"year": year, "papers": papers, "days": ordered}
def main():
    """Print the holiday data of the year given on the command line as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("year", type=int)
    args = parser.parse_args()

    result = fetch_holiday(args.year)
    text = json.dumps(result, indent=4, ensure_ascii=False, cls=CustomJSONEncoder)
    print(text)
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that writes ``date`` objects as ISO format strings."""

    def default(self, o):
        # pylint:disable=method-hidden
        if not isinstance(o, date):
            # Anything else is left to the base class, which raises TypeError.
            return super().default(o)
        return o.isoformat()
# Run the command line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()

56
scripts/fetch_test.py Normal file
View File

@@ -0,0 +1,56 @@
"""Test module `fetch_holidays`. """
import json
import pytest
from fetch import (
CustomJSONEncoder,
DescriptionParser,
get_paper,
get_paper_urls,
get_rules,
)
from filetools import _file_path
def test_get_paper_urls():
    """The 2019 search returns the two known papers, oldest first."""
    expected = [
        "http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm",
        "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm",
    ]
    assert get_paper_urls(2019) == expected
def test_get_rules():
    """The 2019 labour day patch paper yields exactly one rule."""
    url = "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm"
    rules = list(get_rules(get_paper(url)))
    assert rules == [("劳动节", "2019年5月1日至4日放假调休共4天。4月28日星期日、5月5日星期日上班。")]
def _normalize(iterable):
    """Round-trip the days through JSON and sort them by date.

    The round trip turns ``date`` objects into ISO strings, so parsed days
    can be compared with days loaded from a JSON file.
    """
    encoded = json.dumps(list(iterable), cls=CustomJSONEncoder)
    days = json.loads(encoded)
    return sorted(days, key=lambda x: x["date"])
def _description_parsing_cases():
    """Load the parsing cases from ``description_parsing_cases.json``."""
    path = _file_path("description_parsing_cases.json")
    with open(path, "r", encoding="utf-8") as f:
        cases = json.load(f)
    return cases
@pytest.mark.parametrize("case", _description_parsing_cases())
def test_parse_description(case):
    """Parsing the description of a case gives the expected days."""
    year = case["year"]
    description = case["description"]
    expected = case["expected"]

    parsed = DescriptionParser(description, year).parse()
    assert _normalize(parsed) == _normalize(expected), case

10
scripts/filetools.py Normal file
View File

@@ -0,0 +1,10 @@
"""Tools for files. """
import os
# Absolute path of the directory that contains this file.
__dirname__ = os.path.abspath(os.path.dirname(__file__))


def _file_path(*other):
    """Return the absolute path of ``other`` joined onto this file's directory."""
    joined = os.path.join(__dirname__, *other)
    return os.path.abspath(joined)

86
scripts/generate_ics.py Normal file
View File

@@ -0,0 +1,86 @@
import datetime
from typing import Any, Iterator, Sequence, Text, Tuple
from icalendar import Event, Calendar, Timezone, TimezoneStandard
def _create_timezone():
    """Build the ``Asia/Shanghai`` timezone component with a fixed +8 hours offset."""
    offset = datetime.timedelta(hours=8)

    standard = TimezoneStandard()
    standard.add("DTSTART", datetime.datetime(1970, 1, 1))
    standard.add("TZOFFSETFROM", offset)
    standard.add("TZOFFSETTO", offset)

    tz = Timezone()
    tz.add("TZID", "Asia/Shanghai")
    tz.add_component(standard)
    return tz
def _create_event(event_name, start, end):
    """Build a calendar event named ``event_name`` from ``start`` to ``end``."""
    # Create the event / schedule entry. DTSTAMP (creation time) is set to
    # the start of the event.
    event = Event()
    properties = (
        ("SUMMARY", event_name),
        ("DTSTART", start),
        ("DTEND", end),
        ("DTSTAMP", start),
    )
    for key, value in properties:
        event.add(key, value)
    # The UID has to be unique, it is derived from the covered range.
    event["UID"] = f"{start}/{end}/NateScarlet/holiday-cn"
    return event
def _cast_date(v: Any) -> datetime.date:
    """Convert a value to a date.

    Args:
        v (Any): A ``datetime.date`` (returned unchanged) or an ISO format
            date string.

    Raises:
        NotImplementedError: When the value is neither a date nor a string.

    Returns:
        datetime.date: The date.
    """
    if isinstance(v, datetime.date):
        return v
    if isinstance(v, str):
        return datetime.date.fromisoformat(v)
    # The value is wrapped in a tuple: a bare tuple on the right side of `%`
    # would be unpacked as format arguments and raise TypeError instead.
    raise NotImplementedError("can not convert to date: %s" % (v,))
def _iter_date_ranges(days: Sequence[dict]) -> Iterator[Tuple[dict, dict]]:
    """Group days into runs of consecutive dates with the same ``isOffDay``.

    Args:
        days (Sequence[dict]): Days in the order they should be grouped.

    Returns:
        Iterator[Tuple[dict, dict]]: (first day, last day) of every run.
    """
    if not days:
        return

    first = last = days[0]
    for day in days[1:]:
        gap = (_cast_date(day["date"]) - _cast_date(last["date"])).days
        if gap == 1 and day["isOffDay"] == last["isOffDay"]:
            # Still the same run, move its end forward.
            last = day
            continue
        yield first, last
        first = last = day
    yield first, last
def generate_ics(days: Sequence[dict], filename: Text) -> None:
    """Generate ics from days.

    The days are sorted by date and every run of consecutive days of the
    same kind becomes one event. The calendar is written to ``filename``.
    """
    cal = Calendar()
    header = (
        ("X-WR-CALNAME", "中国法定节假日"),
        ("X-WR-CALDESC", "中国法定节假日数据,自动每日抓取国务院公告。"),
        ("VERSION", "2.0"),
        ("METHOD", "PUBLISH"),
        ("CLASS", "PUBLIC"),
    )
    for key, value in header:
        cal.add(key, value)
    cal.add_component(_create_timezone())

    ordered = sorted(days, key=lambda x: x["date"])
    for fr, to in _iter_date_ranges(ordered):
        start = _cast_date(fr["date"])
        # The end is the day after the last day of the run.
        end = _cast_date(to["date"]) + datetime.timedelta(days=1)

        label = fr["name"] + "假期"
        if not fr["isOffDay"]:
            label = "上班(补" + label + ")"
        cal.add_component(_create_event(label, start, end))

    with open(filename, "wb") as f:
        f.write(cal.to_ical())

181
scripts/update.py Normal file
View File

@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""Script for updating data. """
import argparse
import json
import os
import re
import subprocess
from datetime import datetime, timedelta, tzinfo
from tempfile import mkstemp
from typing import Iterator
from zipfile import ZipFile
from tqdm import tqdm
from fetch import CustomJSONEncoder, fetch_holiday
from generate_ics import generate_ics
class ChinaTimezone(tzinfo):
    """Timezone of china: fixed UTC+8 without daylight saving time."""

    _OFFSET = timedelta(hours=8)
    _NO_DST = timedelta()

    def tzname(self, dt):
        return "UTC+8"

    def utcoffset(self, dt):
        return self._OFFSET

    def dst(self, dt):
        return self._NO_DST
# Absolute path of the directory that contains this file.
# NOTE(review): the data files are addressed relative to this directory, while
# the "$id" written by update_data points at the repository root - confirm the
# files are meant to live next to this script.
__dirname__ = os.path.abspath(os.path.dirname(__file__))


def _file_path(*other):
    """Join ``other`` onto the directory of this file."""
    parts = (__dirname__,) + other
    return os.path.join(*parts)
def update_data(year: int) -> Iterator[str]:
    """Update and store data for a year.

    Writes ``{year}.json`` and then ``{year}.ics``.

    Args:
        year (int): Year to update.

    Returns:
        Iterator[str]: Path of each file, yielded right after it is written.
    """
    json_filename = _file_path(f"{year}.json")
    ics_filename = _file_path(f"{year}.ics")

    # Fetch before the file is opened: opening with "w" truncates it, so a
    # failing fetch would otherwise leave an empty file behind.
    data = fetch_holiday(year)
    document = {
        "$schema": "https://raw.githubusercontent.com/NateScarlet/holiday-cn/master/schema.json",
        "$id": f"https://raw.githubusercontent.com/NateScarlet/holiday-cn/master/{year}.json",
        **data,
    }
    with open(json_filename, "w", encoding="utf-8", newline="\n") as f:
        json.dump(
            document,
            f,
            indent=4,
            ensure_ascii=False,
            cls=CustomJSONEncoder,
        )

    yield json_filename
    generate_ics(data["days"], ics_filename)
    yield ics_filename
def update_main_ics(fr_year, to_year):
    """Write ``holiday-cn.ics`` from the yearly json files.

    Args:
        fr_year: First year to include.
        to_year: Last year to include.

    Returns:
        Path of the written ics file.
    """
    all_days = []
    for year in range(fr_year, to_year + 1):
        source = _file_path(f"{year}.json")
        if os.path.isfile(source):
            with open(source, "r", encoding="utf8") as inf:
                data = json.loads(inf.read())
            all_days.extend(data.get("days"))
        # A year without json file is skipped.

    filename = _file_path("holiday-cn.ics")
    generate_ics(all_days, filename)
    return filename
def main():
    """Update the data files and optionally publish a release.

    The yearly files and ``holiday-cn.ics`` are regenerated and staged with
    ``hub add``. Without staged changes, or without ``--release``, the
    function stops there. Otherwise the changes are committed and pushed,
    the json files are packed and a release tagged with the current date is
    created.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--all",
        action="store_true",
        help="Update all years since 2007, default is this year and next year",
    )
    parser.add_argument(
        "--release",
        action="store_true",
        help="create new release if repository data is not up to date",
    )
    args = parser.parse_args()

    now = datetime.now(ChinaTimezone())
    is_release = args.release

    filenames = []
    progress = tqdm(range(2007 if args.all else now.year, now.year + 2))
    for i in progress:
        progress.set_description(f"Updating {i} data")
        filenames += list(update_data(i))
    progress.set_description("Updating holiday-cn.ics")
    filenames.append(update_main_ics(now.year - 4, now.year + 1))
    print("")

    subprocess.run(["hub", "add", *filenames], check=True)
    diff = subprocess.run(
        ["hub", "diff", "--stat", "--cached", "*.json", "*.ics"],
        check=True,
        stdout=subprocess.PIPE,
        encoding="utf-8",
    ).stdout
    if not diff:
        print("Already up to date.")
        return

    if not is_release:
        print("Updated repository data, skip release since not specified `--release`")
        return

    subprocess.run(
        [
            "hub",
            "commit",
            "-m",
            "chore(release): update holiday data",
            "-m",
            "[skip ci]",
        ],
        check=True,
    )
    subprocess.run(["hub", "push"], check=True)

    tag = now.strftime("%Y.%m.%d")
    temp_note_fd, temp_note_name = mkstemp()
    # The release note file is removed even when packing or the release
    # command fails.
    try:
        with open(temp_note_fd, "w", encoding="utf-8") as f:
            f.write(tag + "\n\n```diff\n" + diff + "\n```\n")
        os.makedirs(_file_path("dist"), exist_ok=True)
        zip_path = _file_path("dist", f"holiday-cn-{tag}.zip")
        pack_data(zip_path)

        subprocess.run(
            [
                "hub",
                "release",
                "create",
                "-F",
                temp_note_name,
                "-a",
                f"{zip_path}#JSON数据",
                tag,
            ],
            check=True,
        )
    finally:
        os.unlink(temp_note_name)
def pack_data(file):
    """Pack data json in zip file.

    Every file in the directory of this script whose name starts with
    digits followed by ``.json`` is added under its bare file name.

    Args:
        file: Path (or file object) of the zip archive to write.
    """
    # The context manager closes the archive, which is what writes its
    # central directory; the file must be complete before it is uploaded.
    with ZipFile(file, "w") as zip_file:
        for i in os.listdir(__dirname__):
            if not re.match(r"\d+\.json", i):
                continue
            zip_file.write(_file_path(i), i)
# Run the update only when this file is executed as a script.
if __name__ == "__main__":
    main()