From cd791e189311ea988f6386ec259f616bf0aca8ee Mon Sep 17 00:00:00 2001
From: NateScarlet
Date: Thu, 17 Aug 2023 22:09:12 +0800
Subject: [PATCH] fix: follow gov.cn api change
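
gov.cn replaced the HTML search at sousuo.gov.cn/s.htm with a JSON API at
sousuo.www.gov.cn/search-gov/data, and published papers moved from
/zhengce/content/ to /zhengce/zhengceku/. Fetch paper urls through the new
paginated API, accept both url forms when validating papers, and normalize
<br/> line breaks so paper content is still split one line per paragraph.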
---
 scripts/fetch.py      | 82 ++++++++++++++++++++++++++++---------------
 scripts/fetch_test.py |  8 ++---
 2 files changed, 57 insertions(+), 33 deletions(-)
diff --git a/scripts/fetch.py b/scripts/fetch.py
index 85ad881..eaab11d 100644
--- a/scripts/fetch.py
+++ b/scripts/fetch.py
@@ -11,17 +11,16 @@ from typing import Iterator, List, Optional, Tuple
 import bs4
 import requests
 
-SEARCH_URL = "http://sousuo.gov.cn/s.htm"
 PAPER_EXCLUDE = [
-    "http://www.gov.cn/zhengce/content/2014-09/29/content_9102.htm",
-    "http://www.gov.cn/zhengce/content/2015-02/09/content_9466.htm",
+    "http://www.gov.cn/zhengce/zhengceku/2014-09/29/content_9102.htm",
+    "http://www.gov.cn/zhengce/zhengceku/2015-02/09/content_9466.htm",
 ]
 PAPER_INCLUDE = {
-    2015: ["http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm"]
+    2015: ["http://www.gov.cn/zhengce/zhengceku/2015-05/13/content_9742.htm"]
 }
 PRE_PARSED_PAPERS = {
-    "http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm": [
+    "http://www.gov.cn/zhengce/zhengceku/2015-05/13/content_9742.htm": [
         {
             "name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
             "date": date(2015, 9, 3),
@@ -43,7 +42,7 @@ PRE_PARSED_PAPERS = {
"isOffDay": False,
},
],
- "http://www.gov.cn/zhengce/content/2020-01/27/content_5472352.htm": [
+ "http://www.gov.cn/zhengce/zhengceku/2020-01/27/content_5472352.htm": [
{
"name": "春节",
"date": date(2020, 1, 31),
@@ -77,6 +76,40 @@ def _raise_for_status_200(resp: requests.Response):
         )
 
 
+def _get_paper_urls(year: int) -> Iterator[str]:
+    has_next_page = True
+    page_index = 0
+    while has_next_page:
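+        # the old HTML search (sousuo.gov.cn/s.htm) is gone; query the JSON
+        # search API instead and page through results until the last page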
+        resp = requests.get(
+            "https://sousuo.www.gov.cn/search-gov/data",
+            params={
+                "t": "zhengcelibrary_gw",  # search scope: policy document library
+                "p": page_index,  # page index (0-based)
+                "n": 5,  # results per page
+                "q": "假期 %d" % (year,),  # query: "holiday <year>"
+                "pcodeJiguan": "国办发明电",  # document code prefix (General Office telegram notice)
+                "puborg": "国务院办公厅",  # publisher: General Office of the State Council
+                "filetype": "通知",  # document type: notice
+                "sort": "pubtime",  # sort by publish time
+            },
+        )
+        _raise_for_status_200(resp)
+        data = resp.json()
+        if data["code"] == 1001:
+            # no match
+            return
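+        # any other non-200 code means the API changed again; fail with details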
+        assert data["code"] == 200, "%s: %s: %s" % (
+            resp.url,
+            data["code"],
+            data["msg"],
+        )
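+        # the query can match papers for other years; keep only results
+        # whose title mentions the requested year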
+        for i in data["searchVO"]["listVO"]:
+            if str(year) in i["title"]:
+                yield i["url"]
+        page_index += 1
+        has_next_page = page_index < data["searchVO"]["totalpage"]
+
+
 def get_paper_urls(year: int) -> List[str]:
     """Find year related paper urls.
@@ -84,29 +117,14 @@ def get_paper_urls(year: int) -> List[str]:
         year (int): eg. 2018
 
     Returns:
-        List[str]: Urls, newlest first.
+        List[str]: Urls, sorted by publish time.
     """
-    resp = requests.get(
-        SEARCH_URL,
-        params={
-            "t": "paper",
-            "advance": "true",
-            "title": year,
-            "q": "假期",
-            "pcodeJiguan": "国办发明电",
-            "puborg": "国务院办公厅",
-        },
-    )
-    _raise_for_status_200(resp)
-    ret = re.findall(
-        r'<li class="res-list".*?<a href="(.+?)".*?</li>', resp.text, flags=re.S
-    )
-    ret = [i for i in ret if i not in PAPER_EXCLUDE]
+    ret = [i for i in _get_paper_urls(year) if i not in PAPER_EXCLUDE]
     ret += PAPER_INCLUDE.get(year, [])
     ret.sort()
     if not ret and date.today().year >= year:
-        raise RuntimeError("could not found papers for %d" % year)
+        raise RuntimeError("could not find papers for %d" % (year,))
     return ret
@@ -121,17 +139,23 @@ def get_paper(url: str) -> str:
"""
assert re.match(
- r"http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm", url
- ), "Site changed, need human verify"
+ r"https?://www.gov.cn/zhengce/(zhengceku|content)/\d{4}-\d{2}/\d{2}/content_\d+.htm",
+ url,
+ ), "site changed, need human verify: %s" % (url,)
     response = requests.get(url)
     _raise_for_status_200(response)
     response.encoding = "utf-8"
     soup = bs4.BeautifulSoup(response.text, features="html.parser")
     container = soup.find(id="UCAP-CONTENT")
-    assert container, f"Can not get paper container from url: {url}"
-    ret = "\n".join((i.get_text() for i in container.find_all("p")))
-    assert ret, f"Can not get paper content from url: {url}"
+    assert isinstance(
+        container, bs4.Tag
+    ), f"Cannot get paper container from url: {url}"
+    p = bs4.BeautifulSoup(
+        container.decode().replace("<br/>", "</p><p>"), features="html.parser"
+    ).find_all("p")
+ ret = "\n".join((i.get_text().strip() for i in p))
+ assert ret, f"can not get paper content from url: {url}"
return ret
diff --git a/scripts/fetch_test.py b/scripts/fetch_test.py
index 7c5496a..159ec60 100644
--- a/scripts/fetch_test.py
+++ b/scripts/fetch_test.py
@@ -16,8 +16,8 @@ from filetools import workspace_path
 def test_get_paper_urls():
     assert get_paper_urls(2019) == [
-        "http://www.gov.cn/zhengce/content/2018-12/06/content_5346276.htm",
-        "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm",
+        "http://www.gov.cn/zhengce/zhengceku/2018-12/06/content_5346276.htm",
+        "http://www.gov.cn/zhengce/zhengceku/2019-03/22/content_5375877.htm",
     ]
@@ -25,7 +25,7 @@ def test_get_rules():
     assert list(
         get_rules(
             get_paper(
-                "http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm"
+                "http://www.gov.cn/zhengce/zhengceku/2019-03/22/content_5375877.htm"
             )
         )
     ) == [("劳动节", "2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。")]
@@ -35,7 +35,7 @@ def test_get_rules_2023():
     got = list(
         get_rules(
             get_paper(
-                "http://www.gov.cn/zhengce/content/2022-12/08/content_5730844.htm"
+                "http://www.gov.cn/zhengce/zhengceku/2022-12/08/content_5730844.htm"
             )
         )
     )