chore: move python code to /scripts

This commit is contained in:
NateScarlet
2022-12-09 16:52:02 +08:00
parent df637aa453
commit ef9f9cd5e3
11 changed files with 17 additions and 10 deletions
+497
View File
@@ -0,0 +1,497 @@
[
{
"year": 2019,
"description": "2018年12月30日至2019年1月1日放假调休,共3天。2018年12月29日(星期六)上班。",
"expected": [
{ "date": "2018-12-30", "isOffDay": true },
{ "date": "2018-12-31", "isOffDay": true },
{ "date": "2019-01-01", "isOffDay": true },
{ "date": "2018-12-29", "isOffDay": false }
]
},
{
"year": 2019,
"description": "2月4日至10日放假调休,共7天。2月2日(星期六)、2月3日(星期日)上班。",
"expected": [
{ "date": "2019-02-04", "isOffDay": true },
{ "date": "2019-02-05", "isOffDay": true },
{ "date": "2019-02-06", "isOffDay": true },
{ "date": "2019-02-07", "isOffDay": true },
{ "date": "2019-02-08", "isOffDay": true },
{ "date": "2019-02-09", "isOffDay": true },
{ "date": "2019-02-10", "isOffDay": true },
{ "date": "2019-02-02", "isOffDay": false },
{ "date": "2019-02-03", "isOffDay": false }
]
},
{
"year": 2019,
"description": "4月5日放假,与周末连休。",
"expected": [{ "date": "2019-04-05", "isOffDay": true }]
},
{
"year": 2019,
"description": "5月1日放假。",
"expected": [{ "date": "2019-05-01", "isOffDay": true }]
},
{
"year": 2019,
"description": "6月7日放假,与周末连休。",
"expected": [{ "date": "2019-06-07", "isOffDay": true }]
},
{
"year": 2019,
"description": "9月13日放假,与周末连休。",
"expected": [{ "date": "2019-09-13", "isOffDay": true }]
},
{
"year": 2019,
"description": "10月1日至7日放假调休,共7天。9月29日(星期日)、10月12日(星期六)上班。",
"expected": [
{ "date": "2019-10-01", "isOffDay": true },
{ "date": "2019-10-02", "isOffDay": true },
{ "date": "2019-10-03", "isOffDay": true },
{ "date": "2019-10-04", "isOffDay": true },
{ "date": "2019-10-05", "isOffDay": true },
{ "date": "2019-10-06", "isOffDay": true },
{ "date": "2019-10-07", "isOffDay": true },
{ "date": "2019-09-29", "isOffDay": false },
{ "date": "2019-10-12", "isOffDay": false }
]
},
{
"year": 2018,
"description": "1月1日放假,与周末连休。",
"expected": [{ "date": "2018-01-01", "isOffDay": true }]
},
{
"year": 2018,
"description": "2月15日至21日放假调休,共7天。2月11日(星期日)、2月24日(星期六)上班。",
"expected": [
{ "date": "2018-02-15", "isOffDay": true },
{ "date": "2018-02-16", "isOffDay": true },
{ "date": "2018-02-17", "isOffDay": true },
{ "date": "2018-02-18", "isOffDay": true },
{ "date": "2018-02-19", "isOffDay": true },
{ "date": "2018-02-20", "isOffDay": true },
{ "date": "2018-02-21", "isOffDay": true },
{ "date": "2018-02-11", "isOffDay": false },
{ "date": "2018-02-24", "isOffDay": false }
]
},
{
"year": 2018,
"description": "4月5日至7日放假调休,共3天。4月8日(星期日)上班。",
"expected": [
{ "date": "2018-04-05", "isOffDay": true },
{ "date": "2018-04-06", "isOffDay": true },
{ "date": "2018-04-07", "isOffDay": true },
{ "date": "2018-04-08", "isOffDay": false }
]
},
{
"year": 2018,
"description": "4月29日至5月1日放假调休,共3天。4月28日(星期六)上班。",
"expected": [
{ "date": "2018-04-29", "isOffDay": true },
{ "date": "2018-04-30", "isOffDay": true },
{ "date": "2018-05-01", "isOffDay": true },
{ "date": "2018-04-28", "isOffDay": false }
]
},
{
"year": 2018,
"description": "6月18日放假,与周末连休。",
"expected": [{ "date": "2018-06-18", "isOffDay": true }]
},
{
"year": 2018,
"description": "9月24日放假,与周末连休。",
"expected": [{ "date": "2018-09-24", "isOffDay": true }]
},
{
"year": 2018,
"description": "10月1日至7日放假调休,共7天。9月29日(星期六)、9月30日(星期日)上班。",
"expected": [
{ "date": "2018-10-01", "isOffDay": true },
{ "date": "2018-10-02", "isOffDay": true },
{ "date": "2018-10-03", "isOffDay": true },
{ "date": "2018-10-04", "isOffDay": true },
{ "date": "2018-10-05", "isOffDay": true },
{ "date": "2018-10-06", "isOffDay": true },
{ "date": "2018-10-07", "isOffDay": true },
{ "date": "2018-09-29", "isOffDay": false },
{ "date": "2018-09-30", "isOffDay": false }
]
},
{
"year": 2016,
"description": "1月1日放假,与周末连休。",
"expected": [{ "date": "2016-01-01", "isOffDay": true }]
},
{
"year": 2016,
"description": "2月7日至13日放假调休,共7天。2月6日(星期六)、2月14日(星期日)上班。",
"expected": [
{ "date": "2016-02-07", "isOffDay": true },
{ "date": "2016-02-08", "isOffDay": true },
{ "date": "2016-02-09", "isOffDay": true },
{ "date": "2016-02-10", "isOffDay": true },
{ "date": "2016-02-11", "isOffDay": true },
{ "date": "2016-02-12", "isOffDay": true },
{ "date": "2016-02-13", "isOffDay": true },
{ "date": "2016-02-06", "isOffDay": false },
{ "date": "2016-02-14", "isOffDay": false }
]
},
{
"year": 2016,
"description": "4月4日放假,与周末连休。",
"expected": [{ "date": "2016-04-04", "isOffDay": true }]
},
{
"year": 2016,
"description": "5月1日放假,5月2日(星期一)补休。",
"expected": [
{ "date": "2016-05-01", "isOffDay": true },
{ "date": "2016-05-02", "isOffDay": true }
]
},
{
"year": 2016,
"description": "6月9日至11日放假调休,共3天。6月12日(星期日)上班。",
"expected": [
{ "date": "2016-06-09", "isOffDay": true },
{ "date": "2016-06-10", "isOffDay": true },
{ "date": "2016-06-11", "isOffDay": true },
{ "date": "2016-06-12", "isOffDay": false }
]
},
{
"year": 2016,
"description": "9月15日至17日放假调休,共3天。9月18日(星期日)上班。",
"expected": [
{ "date": "2016-09-15", "isOffDay": true },
{ "date": "2016-09-16", "isOffDay": true },
{ "date": "2016-09-17", "isOffDay": true },
{ "date": "2016-09-18", "isOffDay": false }
]
},
{
"year": 2016,
"description": "10月1日至7日放假调休,共7天。10月8日(星期六)、10月9日(星期日)上班。",
"expected": [
{ "date": "2016-10-01", "isOffDay": true },
{ "date": "2016-10-02", "isOffDay": true },
{ "date": "2016-10-03", "isOffDay": true },
{ "date": "2016-10-04", "isOffDay": true },
{ "date": "2016-10-05", "isOffDay": true },
{ "date": "2016-10-06", "isOffDay": true },
{ "date": "2016-10-07", "isOffDay": true },
{ "date": "2016-10-08", "isOffDay": false },
{ "date": "2016-10-09", "isOffDay": false }
]
},
{
"year": 2016,
"description": "5月1日放假,5月2日(星期一)补休。",
"expected": [
{ "date": "2016-05-01", "isOffDay": true },
{ "date": "2016-05-02", "isOffDay": true }
]
},
{
"year": 2014,
"description": "1月1日放假1天。",
"expected": [{ "date": "2014-01-01", "isOffDay": true }]
},
{
"year": 2014,
"description": "1月31日至2月6日放假调休,共7天。1月26日(星期日)、2月8日(星期六)上班。",
"expected": [
{ "date": "2014-01-31", "isOffDay": true },
{ "date": "2014-02-01", "isOffDay": true },
{ "date": "2014-02-02", "isOffDay": true },
{ "date": "2014-02-03", "isOffDay": true },
{ "date": "2014-02-04", "isOffDay": true },
{ "date": "2014-02-05", "isOffDay": true },
{ "date": "2014-02-06", "isOffDay": true },
{ "date": "2014-01-26", "isOffDay": false },
{ "date": "2014-02-08", "isOffDay": false }
]
},
{
"year": 2014,
"description": "4月5日放假,4月7日(星期一)补休。",
"expected": [
{ "date": "2014-04-05", "isOffDay": true },
{ "date": "2014-04-07", "isOffDay": true }
]
},
{
"year": 2014,
"description": "5月1日至3日放假调休,共3天。5月4日(星期日)上班。",
"expected": [
{ "date": "2014-05-01", "isOffDay": true },
{ "date": "2014-05-02", "isOffDay": true },
{ "date": "2014-05-03", "isOffDay": true },
{ "date": "2014-05-04", "isOffDay": false }
]
},
{
"year": 2014,
"description": "6月2日放假,与周末连休。",
"expected": [{ "date": "2014-06-02", "isOffDay": true }]
},
{
"year": 2014,
"description": "9月8日放假,与周末连休。",
"expected": [{ "date": "2014-09-08", "isOffDay": true }]
},
{
"year": 2014,
"description": "10月1日至7日放假调休,共7天。9月28日(星期日)、10月11日(星期六)上班。",
"expected": [
{ "date": "2014-10-01", "isOffDay": true },
{ "date": "2014-10-02", "isOffDay": true },
{ "date": "2014-10-03", "isOffDay": true },
{ "date": "2014-10-04", "isOffDay": true },
{ "date": "2014-10-05", "isOffDay": true },
{ "date": "2014-10-06", "isOffDay": true },
{ "date": "2014-10-07", "isOffDay": true },
{ "date": "2014-09-28", "isOffDay": false },
{ "date": "2014-10-11", "isOffDay": false }
]
},
{
"year": 2013,
"description": "1月1日至3日放假调休,共3天。1月5日(星期六)、1月6日(星期日)上班。",
"expected": [
{ "date": "2013-01-01", "isOffDay": true },
{ "date": "2013-01-02", "isOffDay": true },
{ "date": "2013-01-03", "isOffDay": true },
{ "date": "2013-01-05", "isOffDay": false },
{ "date": "2013-01-06", "isOffDay": false }
]
},
{
"year": 2013,
"description": "2月9日至15日放假调休,共7天。2月16日(星期六)、2月17日(星期日)上班。",
"expected": [
{ "date": "2013-02-09", "isOffDay": true },
{ "date": "2013-02-10", "isOffDay": true },
{ "date": "2013-02-11", "isOffDay": true },
{ "date": "2013-02-12", "isOffDay": true },
{ "date": "2013-02-13", "isOffDay": true },
{ "date": "2013-02-14", "isOffDay": true },
{ "date": "2013-02-15", "isOffDay": true },
{ "date": "2013-02-16", "isOffDay": false },
{ "date": "2013-02-17", "isOffDay": false }
]
},
{
"year": 2013,
"description": "4月4日至6日放假调休,共3天。4月7日(星期日)上班。",
"expected": [
{ "date": "2013-04-04", "isOffDay": true },
{ "date": "2013-04-05", "isOffDay": true },
{ "date": "2013-04-06", "isOffDay": true },
{ "date": "2013-04-07", "isOffDay": false }
]
},
{
"year": 2013,
"description": "4月29日至5月1日放假调休,共3天。4月27日(星期六)、4月28日(星期日)上班。",
"expected": [
{ "date": "2013-04-29", "isOffDay": true },
{ "date": "2013-04-30", "isOffDay": true },
{ "date": "2013-05-01", "isOffDay": true },
{ "date": "2013-04-27", "isOffDay": false },
{ "date": "2013-04-28", "isOffDay": false }
]
},
{
"year": 2013,
"description": "6月10日至12日放假调休,共3天。6月8日(星期六)、6月9日(星期日)上班。",
"expected": [
{ "date": "2013-06-10", "isOffDay": true },
{ "date": "2013-06-11", "isOffDay": true },
{ "date": "2013-06-12", "isOffDay": true },
{ "date": "2013-06-08", "isOffDay": false },
{ "date": "2013-06-09", "isOffDay": false }
]
},
{
"year": 2013,
"description": "9月19日至21日放假调休,共3天。9月22日(星期日)上班。",
"expected": [
{ "date": "2013-09-19", "isOffDay": true },
{ "date": "2013-09-20", "isOffDay": true },
{ "date": "2013-09-21", "isOffDay": true },
{ "date": "2013-09-22", "isOffDay": false }
]
},
{
"year": 2013,
"description": "10月1日至7日放假调休,共7天。9月29日(星期日)、10月12日(星期六)上班。",
"expected": [
{ "date": "2013-10-01", "isOffDay": true },
{ "date": "2013-10-02", "isOffDay": true },
{ "date": "2013-10-03", "isOffDay": true },
{ "date": "2013-10-04", "isOffDay": true },
{ "date": "2013-10-05", "isOffDay": true },
{ "date": "2013-10-06", "isOffDay": true },
{ "date": "2013-10-07", "isOffDay": true },
{ "date": "2013-09-29", "isOffDay": false },
{ "date": "2013-10-12", "isOffDay": false }
]
},
{
"year": 2011,
"description": "2月2日(农历除夕)至8日放假调休,共7天。1月30日(星期日)、2月12日(星期六)上班。",
"expected": [
{ "date": "2011-02-02", "isOffDay": true },
{ "date": "2011-02-03", "isOffDay": true },
{ "date": "2011-02-04", "isOffDay": true },
{ "date": "2011-02-05", "isOffDay": true },
{ "date": "2011-02-06", "isOffDay": true },
{ "date": "2011-02-07", "isOffDay": true },
{ "date": "2011-02-08", "isOffDay": true },
{ "date": "2011-01-30", "isOffDay": false },
{ "date": "2011-02-12", "isOffDay": false }
]
},
{
"year": 2008,
"description": "2007年12月30日—2008年1月1日放假,共3天。其中,1月1日(星期二)为法定节假日,12月30日(星期日)为公休日,12月29日(星期六)公休日调至12月31日(星期一),12月29日(星期六)上班。",
"expected": [
{ "date": "2007-12-30", "isOffDay": true },
{ "date": "2007-12-31", "isOffDay": true },
{ "date": "2008-01-01", "isOffDay": true },
{ "date": "2007-12-29", "isOffDay": false }
]
},
{
"year": 2008,
"description": "2月6日—12日(农历除夕至正月初六)放假,共7天。其中,2月6日(除夕)、2月7日(春节)、2月8日(正月初二)为法定节假日,2月9日(星期六)、2月10日(星期日)照常公休,2月2日(星期六)、2月3日(星期日)两个公休日调至2月11日(星期一)、2月12日(星期二),2月2日(星期六)、2月3日(星期日)上班。",
"expected": [
{ "date": "2008-02-06", "isOffDay": true },
{ "date": "2008-02-07", "isOffDay": true },
{ "date": "2008-02-08", "isOffDay": true },
{ "date": "2008-02-09", "isOffDay": true },
{ "date": "2008-02-10", "isOffDay": true },
{ "date": "2008-02-11", "isOffDay": true },
{ "date": "2008-02-12", "isOffDay": true },
{ "date": "2008-02-02", "isOffDay": false },
{ "date": "2008-02-03", "isOffDay": false }
]
},
{
"year": 2008,
"description": "4月4日—6日放假,共3天。其中,4月4日(清明节)为法定节假日,4月5日(星期六)、4月6日(星期日)照常公休。",
"expected": [
{ "date": "2008-04-04", "isOffDay": true },
{ "date": "2008-04-05", "isOffDay": true },
{ "date": "2008-04-06", "isOffDay": true }
]
},
{
"year": 2008,
"description": "5月1日—3日放假,共3天。其中,5月1日为法定节假日,5月3日(星期六)为公休日,5月4日(星期日)公休日调至5月2日(星期五),5月4日(星期日)上班。",
"expected": [
{ "date": "2008-05-01", "isOffDay": true },
{ "date": "2008-05-02", "isOffDay": true },
{ "date": "2008-05-03", "isOffDay": true },
{ "date": "2008-05-04", "isOffDay": false }
]
},
{
"year": 2008,
"description": "6月7日—9日放假,共3天。其中,6月7日(星期六)照常公休,6月8日(农历五月初五,端午节)为法定节假日,6月8日(星期日)公休日调至6月9日(星期一)。",
"expected": [
{ "date": "2008-06-07", "isOffDay": true },
{ "date": "2008-06-08", "isOffDay": true },
{ "date": "2008-06-09", "isOffDay": true }
]
},
{
"year": 2008,
"description": "9月13日—15日放假,共3天。其中,9月13日(星期六)为公休日,9月14日(农历八月十五,中秋节)为法定节假日,9月14日(星期日)公休日调至9月15日(星期一)。",
"expected": [
{ "date": "2008-09-13", "isOffDay": true },
{ "date": "2008-09-14", "isOffDay": true },
{ "date": "2008-09-15", "isOffDay": true }
]
},
{
"year": 2008,
"description": "9月29日—10月5日放假,共7天。其中,10月1日、2日、3日为法定节假日,9月27日(星期六)、9月28日(星期日)两个公休日调至9月29日(星期一)、30日(星期二),10月4日(星期六)、5日(星期日)照常公休。",
"expected": [
{ "date": "2008-09-29", "isOffDay": true },
{ "date": "2008-09-30", "isOffDay": true },
{ "date": "2008-10-01", "isOffDay": true },
{ "date": "2008-10-02", "isOffDay": true },
{ "date": "2008-10-03", "isOffDay": true },
{ "date": "2008-10-04", "isOffDay": true },
{ "date": "2008-10-05", "isOffDay": true },
{ "date": "2008-09-27", "isOffDay": false },
{ "date": "2008-09-28", "isOffDay": false }
]
},
{
"year": 2007,
"description": "1月1日—3日放假,共3天。其中1月1日为法定假日,将2006年12月30日(星期六)、31日(星期日)两个公休日分别调至2007年1月2日、3日,2006年12月30日(星期六)、31日(星期日)上班。",
"expected": [
{ "date": "2007-01-01", "isOffDay": true },
{ "date": "2007-01-02", "isOffDay": true },
{ "date": "2007-01-03", "isOffDay": true },
{ "date": "2006-12-30", "isOffDay": false },
{ "date": "2006-12-31", "isOffDay": false }
]
},
{
"year": 2007,
"description": "2月18日—24日(即农历大年初一至初七)放假,共7天。其中18日、19日、20日为法定假日,将17日(星期六)、18日(星期日)、25日(星期日)三个公休日分别调至21日(星期三)、22日(星期四)、23日(星期五);24日(星期六)照常公休,17日、25日上班。",
"expected": [
{ "date": "2007-02-18", "isOffDay": true },
{ "date": "2007-02-19", "isOffDay": true },
{ "date": "2007-02-20", "isOffDay": true },
{ "date": "2007-02-21", "isOffDay": true },
{ "date": "2007-02-22", "isOffDay": true },
{ "date": "2007-02-23", "isOffDay": true },
{ "date": "2007-02-24", "isOffDay": true },
{ "date": "2007-02-17", "isOffDay": false },
{ "date": "2007-02-25", "isOffDay": false }
]
},
{
"year": 2007,
"description": "5月1日—7日放假,共7天。其中,1日、2日、3日为法定假日,将4月28日(星期六)、29日(星期日)两个公休日调至5月4日(星期五)、7日(星期一);5月5日(星期六)、6日(星期日)照常公休,4月28日、29日上班。",
"expected": [
{ "date": "2007-05-01", "isOffDay": true },
{ "date": "2007-05-02", "isOffDay": true },
{ "date": "2007-05-03", "isOffDay": true },
{ "date": "2007-05-04", "isOffDay": true },
{ "date": "2007-05-05", "isOffDay": true },
{ "date": "2007-05-06", "isOffDay": true },
{ "date": "2007-05-07", "isOffDay": true },
{ "date": "2007-04-28", "isOffDay": false },
{ "date": "2007-04-29", "isOffDay": false }
]
},
{
"year": 2007,
"description": "10月1日—7日放假,共7天。其中,1日、2日、3日为法定假日,将9月29日(星期六)、30日(星期日)两个公休日调至10月4日(星期四)、5日(星期五);10月6日(星期六)、7日(星期日)照常公休,9月29日、30日上班。",
"expected": [
{ "date": "2007-10-01", "isOffDay": true },
{ "date": "2007-10-02", "isOffDay": true },
{ "date": "2007-10-03", "isOffDay": true },
{ "date": "2007-10-04", "isOffDay": true },
{ "date": "2007-10-05", "isOffDay": true },
{ "date": "2007-10-06", "isOffDay": true },
{ "date": "2007-10-07", "isOffDay": true },
{ "date": "2007-09-29", "isOffDay": false },
{ "date": "2007-09-30", "isOffDay": false }
]
}
]
+436
View File
@@ -0,0 +1,436 @@
#!/usr/bin/env python3
"""Fetch holidays from gov.cn """
import argparse
import json
import re
from datetime import date, timedelta
from itertools import chain
from typing import Iterator, List, Optional, Tuple
import bs4
import requests
# Search endpoint queried by `get_paper_urls`.
SEARCH_URL = "http://sousuo.gov.cn/s.htm"

# Search hits that `get_paper_urls` drops from its result.
PAPER_EXCLUDE = [
    "http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm",
    "http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm",
]

# Extra paper urls that `get_paper_urls` appends for a given year.
PAPER_INCLUDE = {
    2015: ["http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm"],
}

# Papers whose days are hard-coded; `parse_paper` yields these entries instead
# of downloading and parsing the paper.
PRE_PARSED_PAPERS = {
    url: [
        {"name": name, "date": day, "isOffDay": is_off_day}
        for day, is_off_day in schedule
    ]
    for url, name, schedule in (
        (
            "http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm",
            "抗日战争暨世界反法西斯战争胜利70周年纪念日",
            (
                (date(2015, 9, 3), True),
                (date(2015, 9, 4), True),
                (date(2015, 9, 5), True),
                (date(2015, 9, 6), False),
            ),
        ),
        (
            "http://www.gov.cn/zhengce/content/2020-01/27/content_5472352.htm",
            "春节",
            (
                (date(2020, 1, 31), True),
                (date(2020, 2, 1), True),
                (date(2020, 2, 2), True),
                (date(2020, 2, 3), False),
            ),
        ),
    )
}
def _raise_for_status_200(resp: requests.Response):
    """Raise ``requests.HTTPError`` unless the response status is exactly 200.

    ``resp.raise_for_status()`` is called first; a response that passes it but
    whose status code is not 200 is rejected as well.

    Args:
        resp (requests.Response): Response to check.

    Raises:
        requests.HTTPError: When the status code is not 200.
    """
    resp.raise_for_status()
    if resp.status_code == 200:
        return
    message = "request failed: %d: %s" % (resp.status_code, resp.request.url)
    raise requests.HTTPError(message, response=resp)
def get_paper_urls(year: int) -> List[str]:
    """Find year related paper urls.

    Args:
        year (int): eg. 2018

    Raises:
        requests.HTTPError: When the search request does not answer with 200.
        RuntimeError: When no paper is found for the current or a past year.

    Returns:
        List[str]: Urls in ascending string order. The publication date is
            part of the url path, so the oldest paper comes first.
    """
    resp = requests.get(
        SEARCH_URL,
        params={
            "t": "paper",
            "advance": "true",
            "title": year,
            "q": "假期",
            "pcodeJiguan": "国办发明电",
            "puborg": "国务院办公厅",
        },
        # Without a timeout an unresponsive server blocks the script forever.
        timeout=30,
    )
    _raise_for_status_200(resp)
    ret = re.findall(
        r'<li class="res-list".*?<a href="(.+?)".*?</li>', resp.text, flags=re.S
    )
    ret = [i for i in ret if i not in PAPER_EXCLUDE]
    ret += PAPER_INCLUDE.get(year, [])
    ret.sort()
    # A future year may simply have no paper yet; anything else is an error.
    if not ret and date.today().year >= year:
        raise RuntimeError("could not found papers for %d" % year)
    return ret
def get_paper(url: str) -> str:
    """Extract paper text from url.

    Args:
        url (str): Paper url.

    Raises:
        AssertionError: When the url or the page layout is not the expected one.
        requests.HTTPError: When the request does not answer with 200.

    Returns:
        str: Extracted paper text.
    """
    # Dots are escaped so that they only match a literal "." in the url.
    assert re.match(
        r"http://www\.gov\.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+\.htm",
        url,
    ), "Site changed, need human verify"
    # Without a timeout an unresponsive server blocks the script forever.
    response = requests.get(url, timeout=30)
    _raise_for_status_200(response)
    response.encoding = "utf-8"
    soup = bs4.BeautifulSoup(response.text, features="html.parser")
    container = soup.find("td", class_="b12c")
    assert container, f"Can not get paper container from url: {url}"
    # Two ideographic spaces are replaced by a line break so that
    # `str.splitlines` can later split the text at those points.
    ret = container.get_text().replace("\u3000\u3000", "\n")
    assert ret, f"Can not get paper content from url: {url}"
    return ret
def get_rules(paper: str) -> Iterator[Tuple[str, str]]:
    """Extract rules from paper.

    Args:
        paper (str): Paper text

    Raises:
        NotImplementedError: When find no rules. Raised once the generator is
            exhausted without having produced anything.

    Returns:
        Iterator[Tuple[str, str]]: (name, description)
    """
    # Drop repeated lines while keeping the first occurrence order;
    # `dict.fromkeys` does this in linear time.
    lines = list(dict.fromkeys(paper.splitlines()))
    count = 0
    for rule in chain(get_normal_rules(lines), get_patch_rules(lines)):
        count += 1
        yield rule
    if not count:
        raise NotImplementedError(lines)
def get_normal_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]:
    """Get normal holiday rule for a year

    A rule line looks like ``一、元旦:1月1日放假``: a Chinese numeral, ``、``,
    the holiday name, a colon and the description. Both the ASCII colon and
    the full-width colon are accepted.

    Args:
        lines (Iterator[str]): paper content

    Returns:
        Iterator[Tuple[str, str]]: (name, description)
    """
    for line in lines:
        match = re.match(r"[一二三四五六七八九十]、(.+?)[::](.+)", line)
        if match:
            yield match.groups()
def get_patch_rules(lines: Iterator[str]) -> Iterator[Tuple[str, str]]:
    """Get holiday patch rule for existed holiday

    The holiday name is taken from the most recent line that announces an
    arrangement for a holiday; every later numbered item that mentions a
    month/day date is reported under that name.

    Args:
        lines (Iterator[str]): paper content

    Returns:
        Iterator[Tuple[str, str]]: (name, description)
    """
    holiday = None
    for line in lines:
        title = re.match(r".*\d+年([^和、]{2,})(?:假期|放假).*安排", line)
        if title:
            holiday = title.group(1)
        if not holiday:
            # Numbered items seen before any holiday title are ignored.
            continue
        item = re.match(r"^[一二三四五六七八九十]、(.+)$", line)
        if item and re.match(r".*\d+月\d+日.*", item.group(1)):
            yield holiday, item.group(1)
def _cast_int(value):
return int(value) if value else None
class DescriptionParser:
    """Parser for holiday shift description."""

    def __init__(self, description: str, year: int):
        self.description = description
        self.year = year
        # Every date extracted so far, in extraction order (duplicates kept).
        self.date_history = []

    def parse(self) -> Iterator[dict]:
        """Generator for description parsing result.

        The description is split into sentences and each sentence is handed
        to a `SentenceParser`.

        Raises:
            NotImplementedError: When no date was found in the description.

        Returns:
            Iterator[dict]: Days without name field.
        """
        self.date_history.clear()
        for sentence in re.split("[,。;]", self.description):
            yield from SentenceParser(self, sentence).parse()
        if not self.date_history:
            raise NotImplementedError(self.description)

    def get_date(self, year: Optional[int], month: Optional[int], day: int) -> date:
        """Get date in context.

        Args:
            year (Optional[int]): year, defaults to the context year
            month (Optional[int]): month, defaults to the month of the last
                date in the history
            day (int): day

        Returns:
            date: Date result
        """
        assert day, "No day specified"

        # Special case: month inherit
        if month is None:
            month = self.date_history[-1].month

        # Special case: 12 month may mean previous year
        is_previous_year = (
            year is None
            and month == 12
            and self.date_history
            and max(self.date_history) < date(year=self.year, month=2, day=1)
        )
        if is_previous_year:
            year = self.year - 1

        return date(year=year or self.year, month=month, day=day)
class SentenceParser:
    """Parser for holiday shift description sentence.

    Dates are resolved through ``parent.get_date`` and recorded in
    ``parent.date_history``; a date is reported only the first time it is met.
    """

    # One date: optional year, optional month, mandatory day.
    _DATE = r"(?:(\d+)年)?(?:(\d+)月)?(\d+)日"
    # Parenthesised remark such as "(星期六)", with ASCII or full-width brackets.
    _REMARK = r"[((][^()()]*[))]"

    def __init__(self, parent: "DescriptionParser", sentence):
        self.parent = parent
        self.sentence = sentence

    @staticmethod
    def _to_ints(groups):
        """Convert regex groups to ints, empty groups become None."""
        return [int(i) if i else None for i in groups]

    def extract_dates(self, text: str) -> Iterator[date]:
        """Extract date from text.

        Args:
            text (str): Text to extract

        Raises:
            NotImplementedError: When no extraction method finds a date.

        Returns:
            Iterator[date]: Extracted dates, without those already present in
                the parent history.
        """
        count = 0
        for i in chain(
            *(method(self, text) for method in self.date_extraction_methods)
        ):
            count += 1
            is_seen = i in self.parent.date_history
            # Seen dates are recorded again: the last history entry provides
            # the month for later dates that omit it.
            self.parent.date_history.append(i)
            if is_seen:
                continue
            yield i
        if not count:
            raise NotImplementedError(text)

    def _extract_dates_1(self, value: str) -> Iterator[date]:
        """Yield every single date found in value."""
        for groups in re.findall(self._DATE, value):
            groups = self._to_ints(groups)
            assert len(groups) == 3, groups
            yield self.parent.get_date(year=groups[0], month=groups[1], day=groups[2])

    def _extract_dates_2(self, value: str) -> Iterator[date]:
        """Yield each day of every "start 至/-/— end" range found in value."""
        # Remarks are removed so that "2月2日(农历除夕)至8日" reads as a range.
        value = re.sub(self._REMARK, "", value)
        pattern = self._DATE + r"(?:至|-|—)" + self._DATE
        for groups in re.findall(pattern, value):
            groups = self._to_ints(groups)
            assert len(groups) == 6, groups
            start = self.parent.get_date(year=groups[0], month=groups[1], day=groups[2])
            end = self.parent.get_date(year=groups[3], month=groups[4], day=groups[5])
            for i in range((end - start).days + 1):
                yield start + timedelta(days=i)

    def _extract_dates_3(self, value: str) -> Iterator[date]:
        """Yield the first and the last date of every "、" separated list."""
        value = re.sub(self._REMARK, "", value)
        # A repeated group only keeps its last repetition, so each match
        # carries exactly two dates: the first one and the last one.
        pattern = self._DATE + r"(?:、" + self._DATE + r")+"
        for groups in re.findall(pattern, value):
            groups = self._to_ints(groups)
            assert not (len(groups) % 3), groups
            for i in range(0, len(groups), 3):
                yield self.parent.get_date(
                    year=groups[i], month=groups[i + 1], day=groups[i + 2]
                )

    date_extraction_methods = [_extract_dates_1, _extract_dates_2, _extract_dates_3]

    def parse(self) -> Iterator[dict]:
        """Parse days of the sentence with every parsing method.

        Returns:
            Iterator[dict]: Days without name field.
        """
        for method in self.parsing_methods:
            yield from method(self)

    def _parse_rest_1(self):
        """Sentence ending with a rest keyword: its dates are off days."""
        match = re.match(r"(.+)(放假|补休|调休|公休)+(?:\d+天)?$", self.sentence)
        if match:
            for i in self.extract_dates(match.group(1)):
                yield {"date": i, "isOffDay": True}

    def _parse_work_1(self):
        """Sentence ending with "上班": its dates are work days."""
        match = re.match("(.+)上班$", self.sentence)
        if match:
            for i in self.extract_dates(match.group(1)):
                yield {"date": i, "isOffDay": False}

    def _parse_shift_1(self):
        """"A 调至 B": dates in A become work days, dates in B off days."""
        match = re.match("(.+)调至(.+)", self.sentence)
        if match:
            for i in self.extract_dates(match.group(1)):
                yield {"date": i, "isOffDay": False}
            for i in self.extract_dates(match.group(2)):
                yield {"date": i, "isOffDay": True}

    parsing_methods = [
        _parse_rest_1,
        _parse_work_1,
        _parse_shift_1,
    ]
def parse_paper(year: int, url: str) -> Iterator[dict]:
    """Parse one paper

    Papers listed in `PRE_PARSED_PAPERS` are answered from that table without
    any download.

    Args:
        year (int): Year
        url (str): Paper url

    Raises:
        RuntimeError: When the rules or a description can not be parsed.

    Returns:
        Iterator[dict]: Days
    """
    if url in PRE_PARSED_PAPERS:
        yield from PRE_PARSED_PAPERS[url]
        return

    rules = get_rules(get_paper(url))
    try:
        # `rules` is lazy, so rule extraction failures surface here as well.
        for name, description in rules:
            for day in DescriptionParser(description, year).parse():
                yield {"name": name, **day}
    except NotImplementedError as ex:
        raise RuntimeError("Can not parse paper", url) from ex
def fetch_holiday(year: int):
    """Fetch holiday data.

    Args:
        year (int): Year

    Returns:
        dict: The year, the paper urls used and the days sorted by date.
    """
    papers = get_paper_urls(year)

    days = {}
    for url in papers:
        for day in parse_paper(year, url):
            # An entry from a later paper replaces an earlier one for the
            # same date.
            days[day["date"]] = day

    return {
        "year": year,
        "papers": papers,
        "days": sorted(days.values(), key=lambda x: x["date"]),
    }
def main():
    """Print the holiday data of the year given on the command line as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("year", type=int)
    year = parser.parse_args().year

    payload = fetch_holiday(year)
    print(json.dumps(payload, indent=4, ensure_ascii=False, cls=CustomJSONEncoder))
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that writes `date` objects as ISO 8601 strings."""

    def default(self, o):
        # pylint:disable=method-hidden
        if not isinstance(o, date):
            # Let the base class handle (or reject) every other type.
            return super().default(o)
        return o.isoformat()
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()
+56
View File
@@ -0,0 +1,56 @@
"""Test module `fetch_holidays`. """
import json
import pytest
from fetch import (
CustomJSONEncoder,
DescriptionParser,
get_paper,
get_paper_urls,
get_rules,
)
from filetools import _file_path
def test_get_paper_urls():
    """The 2019 search returns both known papers in ascending url order."""
    expected = [
        "http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm",
        "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm",
    ]
    assert get_paper_urls(2019) == expected
def test_get_rules():
    """The 2019 labour day patch paper yields exactly one rule."""
    paper = get_paper(
        "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm"
    )
    rules = list(get_rules(paper))
    assert rules == [("劳动节", "2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。")]
def _normalize(iterable):
    """Round-trip days through JSON and sort them by date.

    Dates are serialized by `CustomJSONEncoder`, so parsed days and the
    expected days loaded from JSON become comparable.
    """
    serialized = json.dumps(list(iterable), cls=CustomJSONEncoder)
    days = json.loads(serialized)
    days.sort(key=lambda x: x["date"])
    return days
def _description_parsing_cases():
    """Load the cases from description_parsing_cases.json."""
    path = _file_path("description_parsing_cases.json")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
@pytest.mark.parametrize("case", _description_parsing_cases())
def test_parse_description(case):
    """Each description parses to the expected days, in any order."""
    year, description, expected = case["year"], case["description"], case["expected"]
    parsed = DescriptionParser(description, year).parse()
    assert _normalize(parsed) == _normalize(expected), case
+10
View File
@@ -0,0 +1,10 @@
"""Tools for files. """
import os
__dirname__ = os.path.abspath(os.path.dirname(__file__))
def _file_path(*other):
return os.path.abspath(os.path.join(__dirname__, *other))
+86
View File
@@ -0,0 +1,86 @@
import datetime
from typing import Any, Iterator, Sequence, Text, Tuple
from icalendar import Event, Calendar, Timezone, TimezoneStandard
def _create_timezone():
    """Build the "Asia/Shanghai" timezone component with a fixed +8h offset."""
    standard = TimezoneStandard()
    standard.add("DTSTART", datetime.datetime(1970, 1, 1))
    # Same offset before and after: the zone has no transition.
    standard.add("TZOFFSETFROM", datetime.timedelta(hours=8))
    standard.add("TZOFFSETTO", datetime.timedelta(hours=8))

    tz = Timezone()
    tz.add("TZID", "Asia/Shanghai")
    tz.add_component(standard)
    return tz
def _create_event(event_name, start, end):
    """Create a calendar event named `event_name` from `start` to `end`."""
    # Create the event / schedule entry.
    event = Event()
    properties = (
        ("SUMMARY", event_name),
        ("DTSTART", start),
        ("DTEND", end),
        # Creation time: the start value is reused here.
        ("DTSTAMP", start),
    )
    for key, value in properties:
        event.add(key, value)

    # The UID must be unique.
    event["UID"] = f"{start}/{end}/NateScarlet/holiday-cn"
    return event
def _cast_date(v: Any) -> datetime.date:
if isinstance(v, datetime.date):
return v
if isinstance(v, str):
return datetime.date.fromisoformat(v)
raise NotImplementedError("can not convert to date: %s" % v)
def _iter_date_ranges(days: Sequence[dict]) -> Iterator[Tuple[dict, dict]]:
if len(days) == 0:
return
if len(days) == 1:
yield days[0], days[0]
return
fr, to = days[0], days[0]
for cur in days[1:]:
if (_cast_date(cur["date"]) - _cast_date(to["date"])).days == 1 and cur[
"isOffDay"
] == to["isOffDay"]:
to = cur
else:
yield fr, to
fr, to = cur, cur
yield fr, to
def generate_ics(days: Sequence[dict], filename: Text) -> None:
    """Generate ics from days.

    Consecutive days with the same `isOffDay` value are merged into a single
    event and the calendar is written to `filename`.
    """
    cal = Calendar()
    for key, value in (
        ("X-WR-CALNAME", "中国法定节假日"),
        ("X-WR-CALDESC", "中国法定节假日数据,自动每日抓取国务院公告。"),
        ("VERSION", "2.0"),
        ("METHOD", "PUBLISH"),
        ("CLASS", "PUBLIC"),
    ):
        cal.add(key, value)
    cal.add_component(_create_timezone())

    ordered = sorted(days, key=lambda x: x["date"])
    for fr, to in _iter_date_ranges(ordered):
        start = _cast_date(fr["date"])
        # One day is added to the last date to get the event end.
        end = _cast_date(to["date"]) + datetime.timedelta(days=1)

        name = fr["name"] + "假期"
        if not fr["isOffDay"]:
            name = "上班(补" + name + ")"
        cal.add_component(_create_event(name, start, end))

    with open(filename, "wb") as f:
        f.write(cal.to_ical())
+181
View File
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""Script for updating data. """
import argparse
import json
import os
import re
import subprocess
from datetime import datetime, timedelta, tzinfo
from tempfile import mkstemp
from typing import Iterator
from zipfile import ZipFile
from tqdm import tqdm
from fetch import CustomJSONEncoder, fetch_holiday
from generate_ics import generate_ics
class ChinaTimezone(tzinfo):
    """Timezone of china: fixed UTC+8 offset without daylight saving."""

    _OFFSET = timedelta(hours=8)
    _NO_DST = timedelta()

    def tzname(self, dt):
        return "UTC+8"

    def utcoffset(self, dt):
        return self._OFFSET

    def dst(self, dt):
        return self._NO_DST
__dirname__ = os.path.abspath(os.path.dirname(__file__))
def _file_path(*other):
return os.path.join(__dirname__, *other)
def update_data(year: int) -> Iterator[str]:
    """Update and store data for a year.

    Args:
        year (int): Year to update.

    Returns:
        Iterator[str]: Path of the written json file, then of the ics file.
    """
    json_filename = _file_path(f"{year}.json")
    ics_filename = _file_path(f"{year}.ics")

    # Fetch before opening the file: mode "w" truncates it, so a failing
    # fetch would otherwise leave an empty data file behind.
    data = fetch_holiday(year)
    document = {
        "$schema": "https://raw.githubusercontent.com/NateScarlet/holiday-cn/master/schema.json",
        "$id": f"https://raw.githubusercontent.com/NateScarlet/holiday-cn/master/{year}.json",
        **data,
    }
    with open(json_filename, "w", encoding="utf-8", newline="\n") as f:
        json.dump(
            document,
            f,
            indent=4,
            ensure_ascii=False,
            cls=CustomJSONEncoder,
        )

    yield json_filename
    generate_ics(data["days"], ics_filename)
    yield ics_filename
def update_main_ics(fr_year, to_year):
    """Rebuild holiday-cn.ics from the yearly json files.

    Args:
        fr_year: First year to include.
        to_year: Last year to include (inclusive).

    Returns:
        Path of the written ics file.
    """
    all_days = []
    for year in range(fr_year, to_year + 1):
        yearly = _file_path(f"{year}.json")
        # Years without a data file are skipped.
        if os.path.isfile(yearly):
            with open(yearly, "r", encoding="utf8") as inf:
                all_days.extend(json.load(inf).get("days"))

    target = _file_path("holiday-cn.ics")
    generate_ics(all_days, target)
    return target
def main():
    """Update the data files and optionally publish a release.

    The yearly files and holiday-cn.ics are regenerated and staged with
    `hub add`. When `--release` is given and the staged diff is not empty,
    the change is committed, pushed and released with a zip of the json data.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--all",
        action="store_true",
        help="Update all years since 2007, default is this year and next year",
    )
    parser.add_argument(
        "--release",
        action="store_true",
        help="create new release if repository data is not up to date",
    )
    args = parser.parse_args()

    now = datetime.now(ChinaTimezone())
    is_release = args.release

    filenames = []
    progress = tqdm(range(2007 if args.all else now.year, now.year + 2))
    for i in progress:
        progress.set_description(f"Updating {i} data")
        filenames += list(update_data(i))
    progress.set_description("Updating holiday-cn.ics")
    filenames.append(update_main_ics(now.year - 4, now.year + 1))
    print("")

    subprocess.run(["hub", "add", *filenames], check=True)
    diff = subprocess.run(
        ["hub", "diff", "--stat", "--cached", "*.json", "*.ics"],
        check=True,
        stdout=subprocess.PIPE,
        encoding="utf-8",
    ).stdout
    if not diff:
        print("Already up to date.")
        return

    if not is_release:
        print("Updated repository data, skip release since not specified `--release`")
        return

    subprocess.run(
        [
            "hub",
            "commit",
            "-m",
            "chore(release): update holiday data",
            "-m",
            "[skip ci]",
        ],
        check=True,
    )
    subprocess.run(["hub", "push"], check=True)

    tag = now.strftime("%Y.%m.%d")
    temp_note_fd, temp_note_name = mkstemp()
    try:
        with open(temp_note_fd, "w", encoding="utf-8") as f:
            f.write(tag + "\n\n```diff\n" + diff + "\n```\n")
        os.makedirs(_file_path("dist"), exist_ok=True)
        zip_path = _file_path("dist", f"holiday-cn-{tag}.zip")
        pack_data(zip_path)

        subprocess.run(
            [
                "hub",
                "release",
                "create",
                "-F",
                temp_note_name,
                "-a",
                f"{zip_path}#JSON数据",
                tag,
            ],
            check=True,
        )
    finally:
        # Remove the note file even when packing or the release fails.
        os.unlink(temp_note_name)
def pack_data(file):
    """Pack data json in zip file.

    Every `<digits>.json` file of the script directory is stored under its
    bare file name.

    Args:
        file: Path of the zip file to create.
    """
    # The context manager closes the archive, which is what writes the
    # central directory; without it the zip may be left incomplete.
    with ZipFile(file, "w") as zip_file:
        for i in os.listdir(__dirname__):
            # Anchored so that names such as "2019.json.bak" are not packed.
            if not re.match(r"\d+\.json$", i):
                continue
            zip_file.write(_file_path(i), i)
# Run the update only when executed as a script.
if __name__ == "__main__":
    main()