fix: follow gov.cn api change
This commit is contained in:
parent
8e4cf83082
commit
cd791e1893
|
|
@ -11,17 +11,16 @@ from typing import Iterator, List, Optional, Tuple
|
||||||
import bs4
|
import bs4
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
SEARCH_URL = "http://sousuo.gov.cn/s.htm"
|
|
||||||
# Search hits that get_paper_urls filters out of the API results.
# NOTE(review): presumably these match the holiday query but are not usable
# holiday-arrangement notices — confirm against the papers themselves.
PAPER_EXCLUDE = [
    "http://www.gov.cn/zhengce/zhengceku/2014-09/29/content_9102.htm",
    "http://www.gov.cn/zhengce/zhengceku/2015-02/09/content_9466.htm",
]

# Extra paper urls appended per year by get_paper_urls.
# NOTE(review): presumably papers the search API fails to return — verify.
PAPER_INCLUDE = {
    2015: ["http://www.gov.cn/zhengce/zhengceku/2015-05/13/content_9742.htm"]
}
|
||||||
|
|
||||||
PRE_PARSED_PAPERS = {
|
PRE_PARSED_PAPERS = {
|
||||||
"http://www.gov.cn/zhengce/content/2015-05/13/content_9742.htm": [
|
"http://www.gov.cn/zhengce/zhengceku/2015-05/13/content_9742.htm": [
|
||||||
{
|
{
|
||||||
"name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
|
"name": "抗日战争暨世界反法西斯战争胜利70周年纪念日",
|
||||||
"date": date(2015, 9, 3),
|
"date": date(2015, 9, 3),
|
||||||
|
|
@ -43,7 +42,7 @@ PRE_PARSED_PAPERS = {
|
||||||
"isOffDay": False,
|
"isOffDay": False,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"http://www.gov.cn/zhengce/content/2020-01/27/content_5472352.htm": [
|
"http://www.gov.cn/zhengce/zhengceku/2020-01/27/content_5472352.htm": [
|
||||||
{
|
{
|
||||||
"name": "春节",
|
"name": "春节",
|
||||||
"date": date(2020, 1, 31),
|
"date": date(2020, 1, 31),
|
||||||
|
|
@ -77,6 +76,40 @@ def _raise_for_status_200(resp: requests.Response):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_paper_urls(year: int) -> Iterator[str]:
    """Yield policy-paper urls whose title mentions *year*.

    Pages through the sousuo.www.gov.cn search API (5 results per page)
    until the reported last page is reached.  Yields nothing when the API
    answers with its "no match" code.
    """
    page = 0
    while True:
        resp = requests.get(
            "https://sousuo.www.gov.cn/search-gov/data",
            params={
                "t": "zhengcelibrary_gw",
                "p": page,
                "n": 5,
                "q": "假期 %d" % (year,),
                "pcodeJiguan": "国办发明电",
                "puborg": "国务院办公厅",
                "filetype": "通知",
                "sort": "pubtime",
            },
        )
        _raise_for_status_200(resp)
        payload = resp.json()
        # 1001 is the API's "no result" code — not an error, just nothing found.
        if payload["code"] == 1001:
            return
        assert payload["code"] == 200, "%s: %s: %s" % (
            resp.url,
            payload["code"],
            payload["msg"],
        )
        for item in payload["searchVO"]["listVO"]:
            # Keep only results whose title actually names the target year.
            if str(year) in item["title"]:
                yield item["url"]
        page += 1
        if page >= payload["searchVO"]["totalpage"]:
            return
|
||||||
|
|
||||||
|
|
||||||
def get_paper_urls(year: int) -> List[str]:
    """Find year related paper urls.

    Args:
        year (int): eg. 2018

    Returns:
        List[str]: Urls, sort by publish time.

    Raises:
        RuntimeError: when no paper is found for a current or past year.
    """
    found = []
    for url in _get_paper_urls(year):
        # Drop known-bad search hits.
        if url not in PAPER_EXCLUDE:
            found.append(url)
    # Add manually curated papers the search misses.
    found.extend(PAPER_INCLUDE.get(year, []))
    found.sort()
    # A past or current year must have at least one paper; a future year may
    # legitimately have none yet.
    if not found and date.today().year >= year:
        raise RuntimeError("could not found papers for %d" % (year,))
    return found
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -121,17 +139,23 @@ def get_paper(url: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
assert re.match(
|
assert re.match(
|
||||||
r"http://www.gov.cn/zhengce/content/\d{4}-\d{2}/\d{2}/content_\d+.htm", url
|
r"https?://www.gov.cn/zhengce/(zhengceku|content)/\d{4}-\d{2}/\d{2}/content_\d+.htm",
|
||||||
), "Site changed, need human verify"
|
url,
|
||||||
|
), "site changed, need human verify: %s" % (url,)
|
||||||
|
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
_raise_for_status_200(response)
|
_raise_for_status_200(response)
|
||||||
response.encoding = "utf-8"
|
response.encoding = "utf-8"
|
||||||
soup = bs4.BeautifulSoup(response.text, features="html.parser")
|
soup = bs4.BeautifulSoup(response.text, features="html.parser")
|
||||||
container = soup.find(id="UCAP-CONTENT")
|
container = soup.find(id="UCAP-CONTENT")
|
||||||
assert container, f"Can not get paper container from url: {url}"
|
assert isinstance(
|
||||||
ret = "\n".join((i.get_text() for i in container.find_all("p")))
|
container, bs4.Tag
|
||||||
assert ret, f"Can not get paper content from url: {url}"
|
), f"Can not get paper container from url: {url}"
|
||||||
|
p = bs4.BeautifulSoup(
|
||||||
|
container.decode().replace("<br/>", "</p><p>"), features="html.parser"
|
||||||
|
).find_all("p")
|
||||||
|
ret = "\n".join((i.get_text().strip() for i in p))
|
||||||
|
assert ret, f"can not get paper content from url: {url}"
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,8 +16,8 @@ from filetools import workspace_path
|
||||||
|
|
||||||
def test_get_paper_urls():
    """The 2019 search yields exactly the two known holiday papers, in order."""
    expected = [
        "http://www.gov.cn/zhengce/zhengceku/2018-12/06/content_5346276.htm",
        "http://www.gov.cn/zhengce/zhengceku/2019-03/22/content_5375877.htm",
    ]
    assert get_paper_urls(2019) == expected
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -25,7 +25,7 @@ def test_get_rules():
|
||||||
assert list(
|
assert list(
|
||||||
get_rules(
|
get_rules(
|
||||||
get_paper(
|
get_paper(
|
||||||
"http://www.gov.cn/zhengce/content/2019-03/22/content_5375877.htm"
|
"http://www.gov.cn/zhengce/zhengceku/2019-03/22/content_5375877.htm"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
) == [("劳动节", "2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。")]
|
) == [("劳动节", "2019年5月1日至4日放假调休,共4天。4月28日(星期日)、5月5日(星期日)上班。")]
|
||||||
|
|
@ -35,7 +35,7 @@ def test_get_rules_2023():
|
||||||
got = list(
|
got = list(
|
||||||
get_rules(
|
get_rules(
|
||||||
get_paper(
|
get_paper(
|
||||||
"http://www.gov.cn/zhengce/content/2022-12/08/content_5730844.htm"
|
"http://www.gov.cn/zhengce/zhengceku/2022-12/08/content_5730844.htm"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user