holiday-cn/fetch_holidays.py
2019-03-06 21:41:31 +08:00

70 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Fetch holidays from gov.cn """
import argparse
import re
import bs4
import requests
# URL template for the sousuo.gov.cn paper search; ``{year}`` is filled in
# with ``str.format``. The percent-encoded query values are UTF-8 for:
#   title       -> "{year} 节假日"   (holidays)
#   puborg      -> "国务院办公厅"     (General Office of the State Council)
#   pcodeJiguan -> "国办发明电"       (document-number prefix)
SEARCH_URL = (
    'http://sousuo.gov.cn/s.htm'
    '?t=paper'
    '&advance=true'
    '&sort='
    '&title={year}+%E8%8A%82%E5%81%87%E6%97%A5'
    '&puborg=%E5%9B%BD%E5%8A%A1%E9%99%A2%E5%8A%9E%E5%85%AC%E5%8E%85'
    '&pcodeJiguan=%E5%9B%BD%E5%8A%9E%E5%8F%91%E6%98%8E%E7%94%B5'
)
def get_paper_urls(year):
    """Return the paper URLs listed on the gov.cn search page for *year*.

    Args:
        year: Value substituted into the ``{year}`` placeholder of
            ``SEARCH_URL``.

    Returns:
        list of str: For each ``<li class="res-list"`` entry in the result
        page, the first ``<a href="...">`` target that follows it, in page
        order.

    Raises:
        AssertionError: If any extracted link does not start with the
            expected ``http://www.gov.cn/zhengce/content/YYYY-MM/DD/
            content_<digits>.htm`` form, i.e. the page layout probably
            changed and a human should check it.
    """
    url = SEARCH_URL.format(year=year)
    body = requests.get(url).text
    ret = re.findall(
        r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S)

    # Dots are escaped so only the literal host name and ".htm" are accepted;
    # an unescaped "." would match any character in those positions.
    paper_url = re.compile(
        r'http://www\.gov\.cn/zhengce/content/'
        r'\d{4}-\d{2}/\d{2}/content_\d+\.htm')

    # Raised explicitly rather than via ``assert`` so the sanity check is not
    # stripped when Python runs with -O. The exception type is unchanged.
    if not all(paper_url.match(i) for i in ret):
        raise AssertionError('Site changed, need human verify')
    return ret
def get_paper(url):
    """Download a paper page and return the text of its content cell.

    Args:
        url: Address of the paper page to fetch.

    Returns:
        str: ``get_text()`` of the page's ``<td class="b12c">`` element.

    Raises:
        AssertionError: If that element is not found, or its text is empty.
    """
    response = requests.get(url)
    # Force UTF-8 decoding instead of relying on the encoding requests picks.
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, features='html.parser')
    container = soup.find('td', class_='b12c')

    # Explicit raises (not ``assert``) so these checks still run under
    # ``python -O``; exception type and messages are unchanged.
    if not container:
        raise AssertionError(f'Can not get paper container from url: {url}')
    ret = container.get_text()
    if not ret:
        raise AssertionError(f'Can not get paper context from url: {url}')
    return ret
def get_rules(paper: str):
    """Yield the captured parts of each numbered rule line in *paper*.

    A rule line starts with one Chinese numeral from 一 to 十, followed by
    ``、`` and at least two more characters. Duplicate lines are processed
    only once, at the position of their first occurrence.

    Args:
        paper: Full paper text; it is split with ``str.splitlines``.

    Yields:
        tuple of str: ``match.groups()`` for every distinct matching line,
        i.e. the two strings captured after the ``、``.
    """
    # NOTE(review): the two groups are adjacent with no separator, so the
    # lazy first group always captures exactly one character. Presumably a
    # delimiter between them was intended; confirm against the upstream file.
    pattern = re.compile(r'[一二三四五六七八九十]、(.+?)(.+)')

    # dict.fromkeys drops duplicates while keeping first-seen order in O(n),
    # replacing the quadratic sorted(set(lines), key=lines.index).
    for line in dict.fromkeys(paper.splitlines()):
        match = pattern.match(line)
        if match:
            yield match.groups()
def parse_holiday_description(year, description):
    """Placeholder, not implemented yet: does nothing and returns None.

    NOTE(review): presumably meant to turn one rule description into
    holiday data for *year*; confirm the intended contract before
    implementing.
    """
    pass
def parse_paper(url):
    """Placeholder, not implemented yet: does nothing and returns None.

    NOTE(review): presumably meant to fetch the paper at *url* and parse
    its rules; confirm the intended contract before implementing.
    """
    pass
def main():
    """Command-line entry point: print the rules of every paper for a year.

    Reads one positional command-line argument, ``year``, looks up the
    matching paper URLs, downloads each paper and prints every rule tuple
    produced by ``get_rules`` on its own line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('year')
    args = parser.parse_args()

    for url in get_paper_urls(args.year):
        paper = get_paper(url)
        # Plain loop: the output is the side effect, so there is no reason to
        # build a throwaway list of None values with a comprehension.
        for rule in get_rules(paper):
            print(rule)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()