#!/usr/bin/env python3
"""Fetch holidays from gov.cn """
import argparse
import re

import bs4
import requests

# gov.cn paper search: title is '{year} 节假日' (holidays), publisher is
# 国务院办公厅 (General Office of the State Council), document series 国办发明电.
SEARCH_URL = ('http://sousuo.gov.cn/s.htm'
              '?t=paper&advance=true&sort=&title={year}+%E8%8A%82%E5%81%87%E6%97%A5'
              '&puborg=%E5%9B%BD%E5%8A%A1%E9%99%A2%E5%8A%9E%E5%85%AC%E5%8E%85'
              '&pcodeJiguan=%E5%9B%BD%E5%8A%9E%E5%8F%91%E6%98%8E%E7%94%B5')


def get_paper_urls(year):
    """Search gov.cn for the year's holiday notices and return their URLs."""
    url = SEARCH_URL.format(year=year)
    body = requests.get(url).text
    # NOTE: the result-list markup is assumed to be
    # <li class="res-list"> ... <a href="..."> ... </li>; the assert below
    # sanity-checks every extracted link.
    ret = re.findall(
        r'<li class="res-list".*?<a href="(.+?)".*?</li>',
        body, flags=re.S)
    assert all(
        re.match(
            r'http://www\.gov\.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+\.htm', i)
        for i in ret), 'Site layout changed, needs human verification'
    return ret
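# Every link the assert above accepts has the shape (placeholders only):
#   http://www.gov.cn/zhengce/content/YYYY-MM/DD/content_<id>.htm
# Anything else means the search page layout has probably changed.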


def get_paper(url):
    """Download a notice page and return its plain-text body."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, features='html.parser')
    # The notice body lives in a <td class="b12c"> cell on gov.cn pages.
    container = soup.find('td', class_='b12c')
    assert container, f'Cannot get paper container from url: {url}'
    ret = container.get_text()
    assert ret, f'Cannot get paper content from url: {url}'
    return ret


def get_rules(paper: str):
    """Yield (holiday_name, description) tuples from a notice's numbered items."""
    lines: list = paper.splitlines()
    # Deduplicate the lines while keeping their original order.
    for i in sorted(set(lines), key=lines.index):
        # Numbered items look like '一、<name>：<arrangement>'; the colon may be
        # full-width in the notice text, so accept both forms.
        match = re.match(r'[一二三四五六七八九十]、(.+?)[：:](.+)', i)
        if match:
            yield match.groups()
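# Illustrative example of a numbered rule line and the tuple it yields (the
# wording is an assumption modelled on typical notices, not a real quote):
#   '一、元旦：1月1日放假。'  ->  ('元旦', '1月1日放假。')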


def parse_holiday_description(year, description):
    """Parse a single holiday arrangement description. Not implemented yet."""
    pass


def parse_paper(url):
    """Parse one notice into structured holiday data. Not implemented yet."""
    pass
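# One possible way the two stubs above could eventually fit together (an
# assumption for illustration, not part of the original script):
#   def parse_paper(url):
#       for name, description in get_rules(get_paper(url)):
#           yield name, parse_holiday_description(..., description)
# where the year argument for parse_holiday_description still has to come
# from the caller or from the notice text itself.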


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('year', help='year to search for, e.g. 2024')
    args = parser.parse_args()
    papers = get_paper_urls(args.year)
    for url in papers:
        paper = get_paper(url)
        for rule in get_rules(paper):
            print(rule)


if __name__ == '__main__':
    main()