feat: check http status code

This commit is contained in:
NateScarlet 2021-09-24 20:33:12 +08:00
parent 68d591d058
commit b673f3770c
No known key found for this signature in database
GPG Key ID: 5C242793B070309C

View File

@ -46,6 +46,15 @@ PRE_PARSED_PAPERS = {
} }
def _raise_for_status_200(resp: requests.Response):
    """Raise ``requests.HTTPError`` unless *resp* has status code 200.

    ``resp.raise_for_status()`` is invoked first, so any status that
    requests itself treats as an error is reported with its own message.
    Any other status that is not exactly 200 is then rejected here.

    Args:
        resp: The response to validate.

    Raises:
        requests.HTTPError: If the status code is not 200. The offending
            response is attached as the ``response`` attribute.
    """
    # Delegate the statuses requests already considers errors.
    resp.raise_for_status()

    # Guard clause: a plain 200 is the only status accepted.
    if resp.status_code == 200:
        return

    message = "request failed: %s: %d" % (resp.request.url, resp.status_code)
    raise requests.HTTPError(message, response=resp)
def get_paper_urls(year: int) -> List[str]: def get_paper_urls(year: int) -> List[str]:
"""Find year related paper urls. """Find year related paper urls.
@ -56,7 +65,7 @@ def get_paper_urls(year: int) -> List[str]:
List[str]: Urls newlest first. List[str]: Urls newlest first.
""" """
body = requests.get( resp = requests.get(
SEARCH_URL, SEARCH_URL,
params={ params={
"t": "paper", "t": "paper",
@ -66,9 +75,10 @@ def get_paper_urls(year: int) -> List[str]:
"pcodeJiguan": "国办发明电", "pcodeJiguan": "国办发明电",
"puborg": "国务院办公厅", "puborg": "国务院办公厅",
}, },
).text )
_raise_for_status_200(resp)
ret = re.findall( ret = re.findall(
r'<li class="res-list".*?<a href="(.+?)".*?</li>', body, flags=re.S r'<li class="res-list".*?<a href="(.+?)".*?</li>', resp.text, flags=re.S
) )
ret = [i for i in ret if i not in PAPER_EXCLUDE] ret = [i for i in ret if i not in PAPER_EXCLUDE]
ret += PAPER_INCLUDE.get(year, []) ret += PAPER_INCLUDE.get(year, [])
@ -93,6 +103,7 @@ def get_paper(url: str) -> str:
), "Site changed, need human verify" ), "Site changed, need human verify"
response = requests.get(url) response = requests.get(url)
_raise_for_status_200(response)
response.encoding = "utf-8" response.encoding = "utf-8"
soup = bs4.BeautifulSoup(response.text, features="html.parser") soup = bs4.BeautifulSoup(response.text, features="html.parser")
container = soup.find("td", class_="b12c") container = soup.find("td", class_="b12c")