A business requirement came in: collect the drug registration records published on the National Medical Products Administration (国家药品监督管理局) website. I took a quick look at the page structure, figured it was no big deal, and started hammering out code right away. One minute later I ran it, and instead of data the server handed me back a string of obfuscated JavaScript. What looked like a trivial little task clearly had more going on under the hood.
I'll spare you the suspense: I spent roughly two hours stepping through the JavaScript with breakpoints and patching its source, and still couldn't work out the encryption scheme. Credit where it's due, the anti-scraping on these Chinese government sites is solid. Since the boss needed the data urgently, I gave up on requests and fell back on Selenium. The code has no real tricks in it, just routine browser automation, so read it, leave a like, and move on.
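For the record, the abandoned requests attempt looked roughly like this. This is a reconstruction, not the original script; the URL and parameters are taken from the Selenium code further down:

import requests

# The plain-requests attempt: the server answers with an obfuscated
# JavaScript challenge instead of the result table.
url = ("http://app1.sfda.gov.cn/datasearchcnda/face3/search.jsp"
       "?tableId=19&bcId=152904843704682622352673850395&curstart=1")
resp = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
})
print(resp.text[:300])  # prints the JS blob, not the listing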
The code is below: Selenium drives the browser and the results are saved to MySQL. Only the block for the GCP (drug clinical trial institution) records is shown here; the other categories work the same way, so feel free to write those yourself.
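The post doesn't include the table definitions, so here is a minimal guess at the URL-queue table the script expects. The column types and sizes are my assumptions, not the original DDL:

# Hypothetical bootstrap for the URL-queue table; the real schema was
# never published, so the types below are guesses.
import pymysql

conn = pymysql.connect(host='xxx', user='xxx', passwd='xxx', db='xxx', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS original_065_basetable (
            collect_url VARCHAR(512) NOT NULL UNIQUE,  -- detail-page URL
            status      CHAR(1)      NOT NULL          -- '1' pending, '0' done
        ) CHARACTER SET utf8
    """)
conn.commit()
conn.close()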
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import time
import re
from lxml import etree
from bs4 import BeautifulSoup
chrome_options = Options()
# chrome_options.add_argument('--headless')  # uncomment to run without a visible window
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('disable-infobars')
# chrome_options= is the Selenium 3 keyword; newer Selenium versions use options=
driver = webdriver.Chrome('D:\\chromedriver_win32\\chromedriver.exe', chrome_options=chrome_options)
conn = pymysql.connect(host='xxx', user='xxx', passwd='xxx', db='xxx', charset='utf8')
cur = conn.cursor()
# Left over from the abandoned requests attempt; Selenium sends its own
# headers, so this dict is never actually used below.
headers = {
    "Connection": "keep-alive",
    "Host": "app1.sfda.gov.cn",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
}
def get_all_url():
    """Walk the 103 result pages and queue every detail-page URL."""
    base_url = "http://app1.sfda.gov.cn/datasearchcnda/face3/search.jsp?"
    for i in range(1, 104):
        data = "tableId=19&State=1&bcId=152904843704682622352673850395&State=1&curstart={0}&State=1&tableName=TABLE19&State=1&viewtitleName=COLUMN121&State=1&tableView=%25E8%258D%25AF%25E7%2589%25A9%25E4%25B8%25B4%25E5%25BA%258A%25E8%25AF%2595%25E9%25AA%258C%25E6%259C%25BA%25E6%259E%2584%25E5%2590%258D%25E5%258D%2595&State=1&cid=0&State=1&ytableId=0&State=1&searchType=search&State=1".format(str(i))
        url = base_url + data
        time.sleep(2.5)  # throttle: the site blocks aggressive crawlers
        driver.get(url)
        response = driver.page_source
        if "Forbidden" in response or "请检查您的互联网连接是否正常" in response:
            driver.refresh()  # blocked or flaky connection: reload once
            response = driver.page_source
        print("Collecting page %s" % i)
        html = etree.HTML(response)
        trs = html.xpath("/html/body/table[2]//tr")
        for tr in trs:
            href = "".join(tr.xpath(".//a/@href"))
            if href:
                # the href is a javascript: call; pull the quoted relative URL out of it
                detail_url = "http://app1.sfda.gov.cn/datasearchcnda/face3/" + "".join(re.findall("javascript.*?'(.*?)'", href, re.S))
                n = cur.execute("select collect_url from original_065_basetable where collect_url=%s", (detail_url,))
                if n == 0:  # only queue URLs we have not seen before
                    sql = "insert into `original_065_basetable`(collect_url,status) values (%s,%s)"
                    params = (detail_url, "1")
                    cur.execute(sql, params)
        conn.commit()  # pymysql does not autocommit; persist each page's URLs
def get_all_data():
    """Visit every queued detail page and insert the parsed fields."""
    cur.execute("SELECT `collect_url` FROM `original_065_basetable` where status = 1")
    for row in cur.fetchall():
        url = row[0]
        time.sleep(0.5)
        driver.get(url)
        response = driver.page_source
        if "Forbidden" in response or "请检查您的互联网连接是否正常" in response:
            driver.refresh()
            response = driver.page_source
        soup = BeautifulSoup(response, "html.parser")
        proj_id = url.split("&Id=")[1]
        try:
            a1 = soup.find(text="证书编号").findNext("td").text.strip()  # certificate number
        except AttributeError:
            # the page did not render fully; reload once and re-parse
            driver.refresh()
            response = driver.page_source
            soup = BeautifulSoup(response, "html.parser")
            a1 = soup.find(text="证书编号").findNext("td").text.strip()  # certificate number
        a2 = soup.find(text="医疗机构名称").findNext("td").text.strip()  # institution name
        print(a2)  # progress indicator
        a3 = soup.find(text="地址").findNext("td").text.strip()  # address
        a4 = soup.find(text="省市").findNext("td").text.strip()  # province/city
        a5 = soup.find(text="认证日期").findNext("td").text.strip()  # certification date
        a6 = soup.find(text="有效期截止日").findNext("td").text.strip()  # expiry date
        a7 = soup.find(text="认定专业").findNext("td").text.strip()  # certified specialties
        sql = 'insert into original_007_baseinfo(proj_url_id,collect_time,collect_url,k007_000001,k007_000002,k007_000003,k007_000004,k007_000005,k007_000006,k007_000007,regname,regtime,status)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        params = (proj_id, time.strftime('%Y%m%d'), url, a1, a2, a3, a4, a5, a6, a7, 'zuohaojie', time.strftime('%Y%m%d'), '1')
        cur.execute(sql, params)
        # mark the URL as done so a rerun resumes where it left off
        cur.execute("update `original_065_basetable` set status = 0 where collect_url=%s", (url,))
        conn.commit()  # commit per page so progress survives a crash
    cur.close()
    conn.close()
if __name__ == '__main__':
    # get_all_url()  # step 1: collect all the detail-page URLs
    get_all_data()   # step 2: scrape each page into the raw table
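One note on running it: uncomment get_all_url() and run it once first to fill original_065_basetable with detail-page URLs, then run get_all_data() to scrape the pages themselves. Each URL's status flips from 1 to 0 once its page has been saved, so an interrupted run can simply be restarted and will pick up where it stopped.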