目标网站:化妆品生产许可信息管理系统服务平台
首先尝试以get方式对其首页进行爬取
# First attempt: fetch the platform's landing page with a plain GET request
# and print the raw HTML so it can be inspected for the wanted data.
url = "http://scxk.nmpa.gov.cn:81/xk/"
head = {
    # Browser-like request header so the request does not go out with the
    # default python-requests User-Agent. The original literal had stray
    # spaces inside the product tokens ("Mozilla / 5.0(..."); this is the
    # well-formed Firefox 85 string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
}
# requests has no default timeout; without one an unresponsive server would
# block the script forever.
response_text = requests.get(url=url, headers=head, timeout=10).text
print(response_text)
分析返回结果可发现,其中并未包含所需信息。
因此假设这些数据是通过ajax异步请求动态加载的,并据此进行分析:
使用浏览器F12开发者工具的网络面板对其进行抓包。
分析得:
可发现一个post请求返回的json数据中包含着所需信息,
因此改以post形式对其进行爬取。
# Second attempt: POST to the AJAX endpoint that serves the licence list and
# print the decoded JSON. No form data is sent here, so the paging
# parameters are left to the server.
url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
head = {
    # Browser-like request header so the request does not go out with the
    # default python-requests User-Agent. The original literal had stray
    # spaces inside the product tokens ("Mozilla / 5.0(..."); this is the
    # well-formed Firefox 85 string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
}
# requests has no default timeout; without one an unresponsive server would
# block the script forever.
response_json = requests.post(url=url, headers=head, timeout=10).json()
print(response_json)
{
"filesize": "",
"keyword": "",
"list": [
{
"ID": "c3854166c00f46b5b29fe2a55d3df929",
"EPS_NAME": "广东彤姿生物科技有限公司",
"PRODUCT_SN": "粤妆20210037",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 3,
"day": 1,
"hours": 0,
"minutes": 0,
"month": 7,
"nanos": 0,
"seconds": 0,
"time": 7308115200000,
"timezoneOffset": -480,
"year": 301
},
"XK_DATE": "2026-02-02",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91440607MA55AD6D5C",
"XC_DATE": "2201-08-03",
"NUM_": 1
},
{
"ID": "c3ae6ecabbcf4ba68038321b80819753",
"EPS_NAME": "佛山拜澳生物科技有限公司",
"PRODUCT_SN": "粤妆20160229",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 3,
"day": 1,
"hours": 0,
"minutes": 0,
"month": 7,
"nanos": 0,
"seconds": 0,
"time": 7308115200000,
"timezoneOffset": -480,
"year": 301
},
"XK_DATE": "2026-02-02",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91440607579693554T",
"XC_DATE": "2201-08-03",
"NUM_": 2
},
{
"ID": "ed59438f34ae47e794f4c7ee5137c1f7",
"EPS_NAME": "海南京润珍珠生物技术股份有限公司",
"PRODUCT_SN": "琼妆20160001",
"CITY_CODE": "311",
"XK_COMPLETE_DATE": {
"date": 25,
"day": 0,
"hours": 0,
"minutes": 0,
"month": 3,
"nanos": 0,
"seconds": 0,
"time": 1619280000000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-04-25",
"QF_MANAGER_NAME": "海南省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91460000294121210Y",
"XC_DATE": "2021-04-25",
"NUM_": 3
},
{
"ID": "a810f850c54f4cf7a002057cfb4ec279",
"EPS_NAME": "滁州向日葵药业有限公司",
"PRODUCT_SN": "皖妆20210005",
"CITY_CODE": "220",
"XK_COMPLETE_DATE": {
"date": 18,
"day": 4,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1613577600000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-17",
"QF_MANAGER_NAME": "安徽省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91341171MA2UE0W596",
"XC_DATE": "2021-02-18",
"NUM_": 4
},
{
"ID": "73bb06d774f44c2b9d7c006be3711718",
"EPS_NAME": "扬州倍加洁日化有限公司",
"PRODUCT_SN": "苏妆20210011",
"CITY_CODE": "86",
"XK_COMPLETE_DATE": {
"date": 18,
"day": 4,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1613577600000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-17",
"QF_MANAGER_NAME": "江苏省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91321000760545492E",
"XC_DATE": "2021-02-18",
"NUM_": 5
},
{
"ID": "d51920e18414449fa2bda604a2a6b93f",
"EPS_NAME": "克劳丽化妆品(南通)有限公司",
"PRODUCT_SN": "苏妆20210010",
"CITY_CODE": "85",
"XK_COMPLETE_DATE": {
"date": 18,
"day": 4,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1613577600000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-17",
"QF_MANAGER_NAME": "江苏省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91320623MA24WAKU9P",
"XC_DATE": "2021-02-18",
"NUM_": 6
},
{
"ID": "6321fa3a8cad4edba7b5597c3fdea52e",
"EPS_NAME": "广东永佳日化实业有限公司",
"PRODUCT_SN": "粤妆20210050",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 10,
"day": 3,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612886400000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-09",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91440514071881319E",
"XC_DATE": "2021-02-10",
"NUM_": 7
},
{
"ID": "c2cf1364781447e1a3b6dcd90bda6fff",
"EPS_NAME": "江西初美化妆品有限公司",
"PRODUCT_SN": "赣妆20180013",
"CITY_CODE": "93",
"XK_COMPLETE_DATE": {
"date": 10,
"day": 3,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612886400000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2023-07-17",
"QF_MANAGER_NAME": "江西省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91360121MA37RM989B",
"XC_DATE": "2021-02-10",
"NUM_": 8
},
{
"ID": "23e8220096bf40f99ed1d453824b729d",
"EPS_NAME": "江西珍视明药业有限公司",
"PRODUCT_SN": "赣妆20170002",
"CITY_CODE": "97",
"XK_COMPLETE_DATE": {
"date": 10,
"day": 3,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612886400000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2022-01-05",
"QF_MANAGER_NAME": "江西省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91361003789743169E",
"XC_DATE": "2021-02-10",
"NUM_": 9
},
{
"ID": "6626aa180dca43f7b10c45a3dd91c13a",
"EPS_NAME": "中山新妍化妆品有限公司",
"PRODUCT_SN": "粤妆20180230",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 9,
"day": 2,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612800000000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2023-11-04",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91442000MA51EQLWXE",
"XC_DATE": "2021-02-09",
"NUM_": 10
},
{
"ID": "3e18d72df9c24f0b8f15e76d0fbcc913",
"EPS_NAME": "东莞市百丽达生物科技有限公司",
"PRODUCT_SN": "粤妆20170591",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 9,
"day": 2,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612800000000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2022-11-30",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91441900MA4WHBJX7T",
"XC_DATE": "2021-02-09",
"NUM_": 11
},
{
"ID": "889f31556d0149b6ac1be15b09f35b80",
"EPS_NAME": "深圳市创亿鑫生物科技有限公司",
"PRODUCT_SN": "粤妆20160938",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 9,
"day": 2,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612800000000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-08",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "914403007230102536",
"XC_DATE": "2021-02-09",
"NUM_": 12
},
{
"ID": "a96e67d64344436cbc71ae72077894c4",
"EPS_NAME": "朝日化妆品(深圳)有限公司",
"PRODUCT_SN": "粤妆20160028",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 9,
"day": 2,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612800000000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-08",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91440300772747550X",
"XC_DATE": "2021-02-09",
"NUM_": 13
},
{
"ID": "fffd38b18a13480d8d872f0d28f71a53",
"EPS_NAME": "广州振业医药科技有限公司",
"PRODUCT_SN": "粤妆20210056",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 8,
"day": 1,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612713600000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-07",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91440101MA9UYENHXQ",
"XC_DATE": "2021-02-08",
"NUM_": 14
},
{
"ID": "89cf33bca751479281d1988e842d9ca1",
"EPS_NAME": "广州中燊医药科技有限公司",
"PRODUCT_SN": "粤妆20210055",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
"date": 8,
"day": 1,
"hours": 0,
"minutes": 0,
"month": 1,
"nanos": 0,
"seconds": 0,
"time": 1612713600000,
"timezoneOffset": -480,
"year": 121
},
"XK_DATE": "2026-02-07",
"QF_MANAGER_NAME": "广东省药品监督管理局",
"BUSINESS_LICENSE_NUMBER": "91440101MA5D0HBF7L",
"XC_DATE": "2021-02-08",
"NUM_": 15
}
],
"orderBy": "createDate",
"orderType": "desc",
"pageCount": 369,
"pageNumber": 1,
"pageSize": 15,
"property": "",
"totalCount": 5534
}
经过进一步的分析可发现,其中并未包含详情页面所对应的url。
仔细观察详情页的url可以发现规律:
其中第一个详情页的url为
http://scxk.nmpa.gov.cn:81/xk/itownet/portal/dzpz.jsp?id=c3854166c00f46b5b29fe2a55d3df929
而第二个详情页的url为
http://scxk.nmpa.gov.cn:81/xk/itownet/portal/dzpz.jsp?id=c3ae6ecabbcf4ba68038321b80819753
经过分析可发现,只有后面id=的部分发生变化,因此从json中查找,发现
"ID": "c3854166c00f46b5b29fe2a55d3df929",
"EPS_NAME": "广东彤姿生物科技有限公司",
"PRODUCT_SN": "粤妆20210037",
"CITY_CODE": null,
"XK_COMPLETE_DATE": {
json中的ID字段和详情页url中对应的id参数相同,
因此可通过将首页爬取到的id参数与固定的详情页地址进行拼接,从而获得详情页面的url。
根据上次的经验,推测详情页的数据同样是动态加载出来的。
经过验证可发现,这些数据确实包含在一个post请求的响应中。
对详情页的ajax请求进行分析发现:
第一个为:http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById
第二个为:http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById
因此得知每个详情页的ajax请求所对应的url相同。
再继续分析得知,各请求所携带的id参数不同。
因此我们可以设计为:先将每条记录的id爬取下来,再把它作为参数与固定url结合发送post请求。
代码环节
import json
import requests
# Crawl script: collect the record IDs from the paginated list endpoint,
# fetch the detail JSON for every ID, and dump all details into one file.

# List endpoint; each response carries a 'list' of records with an 'ID' key.
url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
head = {
    # Browser-like request header so the requests do not go out with the
    # default python-requests User-Agent. The original literal had stray
    # spaces inside the product tokens ("Mozilla / 5.0(..."); this is the
    # well-formed Firefox 85 string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
}
# requests has no default timeout; without one a single unresponsive request
# would stall the whole crawl.
REQUEST_TIMEOUT = 10  # seconds

page1 = int(input("请输入总页数"))  # total number of list pages to crawl
id_list = []       # record IDs gathered from the list pages
all_datalist = []  # detail JSON objects, one per ID

# Pages are numbered from 1 and the prompt asks for the TOTAL page count, so
# the range must be inclusive of page1. The original range(1, page1) skipped
# the last page (and crawled nothing at all when 1 was entered).
for page in range(1, page1 + 1):
    data = {
        # Form fields of the list request.
        "on": "true",
        "page": page,
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    response_json = requests.post(
        url=url, headers=head, data=data, timeout=REQUEST_TIMEOUT
    ).json()
    # Keep only the ID of every record on this page.
    id_list.extend(dic['ID'] for dic in response_json['list'])

# Detail endpoint: the URL is the same for every record, only the posted
# 'id' form field differs.
post_url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
for license_id in id_list:  # renamed from `id` to avoid shadowing the builtin
    post_data = {"id": license_id}
    detail_json = requests.post(
        url=post_url, data=post_data, headers=head, timeout=REQUEST_TIMEOUT
    ).json()
    all_datalist.append(detail_json)

# The context manager guarantees the file is flushed and closed, even if
# serialisation raises; the original left the handle open.
with open('./alldata.json', 'w', encoding='utf-8') as fp:
    json.dump(all_datalist, fp=fp, ensure_ascii=False)
print("over!")