本学期数据仓库与数据挖掘课程大作业是编程实现一种数据挖掘方法,之前也注意过学校图书馆的荐购系统,数据十分有趣,于是想借这次机会尝试一下。
在学校内网时一般用IP地址访问;在校外,学校为我们提供了VPN服务,而且内网VPN支持自动登录并且不限时间。爬取之前我想,将含有我校园VPN登录信息的cookie放进请求头应该就能实现模拟登录,后面事实证明也是如此。
数据来源主要是以下两个页面:
通过网页检查,可以得到url、cookie、用于定位的xpath语句。
此部分注意以下几点
用到的第三方库:
获取网页返回数据的代码:
# Page number used for pagination; get_response() receives it as a parameter.
page = 1


def get_response(page):
    """Fetch one page of the recommendation history and return it parsed.

    The login cookie is read from the ``NCEPU_COOKIE`` environment variable
    instead of being embedded in the source: a session cookie is a credential
    and must not be published together with the code.

    Args:
        page: Page number appended to the ``page=`` query parameter.

    Returns:
        The result of ``le.HTML(...)`` on the decoded response body
        (``le`` is presumably ``lxml.etree`` -- confirm against the imports).

    Raises:
        RuntimeError: If ``NCEPU_COOKIE`` is not set or is empty.
    """
    # Local import: the file's import block is outside this section.
    import os

    cookie = os.environ.get('NCEPU_COOKIE')
    if not cookie:
        raise RuntimeError(
            'Set the NCEPU_COOKIE environment variable to the Cookie header '
            'of a logged-in browser session before scraping.'
        )

    request = ur.Request(
        url='https://202-204-70-2-8080.web.ncepu.edu.cn/asord/asord_hist.php?page=' + str(page),
        # The User-Agent is picked at random on every call to disguise the
        # client; the cookie carries the login information.
        headers={
            'User-Agent': user_agent.get_user_agent_pc(),
            'Cookie': cookie,
        },
    )
    # Close the HTTP response deterministically once the body has been read.
    with ur.urlopen(request) as resp:
        response = resp.read().decode('utf-8')
    lxml_x = le.HTML(response)
    return lxml_x
上面用到的 user_agent.get_user_agent_pc() 方法用于随机获取一个 User-Agent,以进行 header 伪装,其实现如下:
import random
# Desktop (PC) User-Agent strings.
# Every version number previously carried a stray ".html" after ".0"
# (e.g. "Mozilla/5.0.html"), so the strings did not look like real browsers.
user_agent_pc = [
    # Chrome
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    # Firefox
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    # Opera
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    # QQ Browser
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    # Sogou Browser
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    # 360 Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    # UC Browser
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]
# Mobile User-Agent strings.
# Every version number previously carried a stray ".html" after ".0"
# (e.g. "Version/5.0.html.2"), so the strings did not look like real browsers.
user_agent_phone = [
    # iPhone
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    # iPad
    'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
    'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    # Android
    'Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    # QQ Browser for Android
    'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
    # Opera Mobile on Android
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',
    # Motorola Xoom Android tablet
    'Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13',
]
def get_user_agent_pc():
    """Return one randomly chosen entry of ``user_agent_pc``."""
    chosen = random.choice(user_agent_pc)
    return chosen
def get_user_agent_phone():
    """Return one randomly chosen entry of ``user_agent_phone``."""
    chosen = random.choice(user_agent_phone)
    return chosen
我们通过get_response()函数已经可以得到网页返回的结果,现在需要筛选我们感兴趣的数据:书名、作者、荐购日期、图书馆购买情况、图书馆的反馈:
# Next worksheet row to write; row 0 is used for the column headers.
n = 1

# XPath of one cell of the history table: ``row`` is the <tr> index, ``col``
# the index among the <td class="whitetext"> cells, ``tail`` the final step.
_CELL_XPATH = '//*[@id="container"]/table/tr[{row}]/td[@class="whitetext"][{col}]{tail}'


def _cell(tree, row, col, tail='/text()'):
    """Return the XPath result list for one cell of the history table."""
    return tree.xpath(_CELL_XPATH.format(row=row, col=col, tail=tail))


try:
    for page in range(1, 33):  # 32 pages
        response_ = get_response(page)
        for i in range(2, 22):  # 20 records per page, starting at tr[2]
            # Locate each field of the record by XPath.
            title = _cell(response_, i, 2)[0]
            author = _cell(response_, i, 3)[0]
            date = _cell(response_, i, 5)[0]
            status = _cell(response_, i, 6)[0]
            note = _cell(response_, i, 7)

            # The note cell may be empty: substitute a placeholder. Otherwise
            # join the text fragments so the sheet receives a plain string.
            note = ''.join(note) if note else '无'
            # When the direct text of the status cell is exactly two characters
            # long, the status is read from the nested <font> element instead
            # (presumably the direct text is only whitespace -- TODO confirm).
            if len(status) == 2:
                status = _cell(response_, i, 6, '/font/text()')[0]

            # ``mysheet`` is the worksheet object created elsewhere in the file.
            mysheet.write(n, 0, title)
            mysheet.write(n, 1, author)
            mysheet.write(n, 2, date)
            mysheet.write(n, 3, status)
            mysheet.write(n, 4, note)
            n += 1
            print(n)
except Exception as exc:
    # Best effort: stop scraping but keep the rows collected so far. An
    # IndexError is expected when a page holds fewer than 20 records; any
    # other error is reported instead of being silently discarded.
    print('Scraping stopped after %d rows: %r' % (n - 1, exc))
在上面的语句块之前,我们需要定义mysheet变量,它来自于xlwt库中的xlwt.Workbook.add_sheet()方法,用于Excel表格的存取。
# Create the workbook and the worksheet that receives the scraped rows, then
# fill row 0 with the column titles.
workbook = xlwt.Workbook(encoding='utf-8')
mysheet = workbook.add_sheet('荐购数据', cell_overwrite_ok=True)
header = ['书名', '作者', '荐购日期', '状态', '备注']
for i, column_title in enumerate(header):
    mysheet.write(0, i, column_title)
最后,本页面爬取完成,数据存入Excel表格中。
# Write the workbook to an .xls file in the current working directory and
# report completion on stdout.
workbook.save('荐购表.xls')
print('已导出到Excel表格!')
我们来运行一下:
共用时19秒就爬取完此网站上的632组数据,我们查看根目录会发现多出来的“荐购表.xls”文件,打开后如图:
这样,632条荐购数据已经存入了本地,我们接着进行下一步数据爬取。
用到的第三方库:
关键代码:
# Script-level import: the file's import block is outside this section.
import os

# The login cookie is a credential and is therefore read from the environment
# instead of being embedded in the source.
_search_cookie = os.environ.get('NCEPU_COOKIE')
if not _search_cookie:
    raise RuntimeError(
        'Set the NCEPU_COOKIE environment variable to the Cookie header '
        'of a logged-in browser session before scraping.'
    )

# Build one "title author" query per spreadsheet row.
# NOTE(review): the row range 1..628 is hard-coded; confirm it matches the
# number of rows actually present in ``table``.
for i in range(1, 629):
    title = table.cell(i, 0).value
    author = table.cell(i, 1).value
    search_word.append(title + ' ' + author)

url = 'https://202-204-70-2-8080.web.ncepu.edu.cn/opac/ajax_search_adv.php'
# Delimiters surrounding the first record number in the raw response text.
_MARC_START = '"marcRecNo":"'
_MARC_END = '","num":'

for j in search_word:
    form_data = {"searchWords": [{"fieldList": [{"fieldCode": "", "fieldValue": j}]}], "filters": [],
                 "limiter": [], "sortField": "relevance", "sortType": "desc", "pageSize": 20, "pageCount": 1,
                 "locale": "zh_CN", "first": True}
    headers = {
        # A fresh random User-Agent is chosen for every request.
        'User-Agent': user_agent.get_user_agent_pc(),
        'Cookie': _search_cookie,
        'Content-Type': 'application/json',
    }
    # The endpoint expects a JSON body, hence json.dumps().
    response = requests.post(url=url, headers=headers, data=json.dumps(form_data))
    r = response.text
    try:
        # Text between the first start marker and the first end marker.
        marcRecNo = r[r.index(_MARC_START) + len(_MARC_START):r.index(_MARC_END)]
    except ValueError:
        # str.index() found no marker: record an empty number for this query.
        marcRecNo = ''
    # One entry per query, kept in the same order as ``search_word``.
    marc_s.append(marcRecNo)
# Script-level import: the file's import block is outside this section.
import os

# The login cookie is a credential and is therefore read from the environment
# instead of being embedded in the source.
_detail_cookie = os.environ.get('NCEPU_COOKIE')
if not _detail_cookie:
    raise RuntimeError(
        'Set the NCEPU_COOKIE environment variable to the Cookie header '
        'of a logged-in browser session before scraping.'
    )

# np.save() does not create missing directories, so make sure it exists.
os.makedirs('书籍信息', exist_ok=True)

for marc in marc_s:
    # Queries that returned no record number are stored as '' and skipped.
    if marc == '':
        continue

    request = ur.Request(
        url='https://202-204-70-2-8080.web.ncepu.edu.cn/opac/item.php?marc_no=' + marc,
        headers={
            'User-Agent': user_agent.get_user_agent_pc(),
            'cookie': _detail_cookie,
        },
    )
    # Close the HTTP response deterministically once the body has been read.
    with ur.urlopen(request) as resp:
        response = resp.read()
    lxml_ = le.HTML(response)

    def get_value(xpath):
        """Evaluate ``xpath`` against the most recently parsed detail page."""
        return lxml_.xpath(xpath)

    # All text nodes below the detail list; the last six entries are dropped.
    inf_s = get_value('//*[@id="item_detail"]/dl/descendant::*/text()')
    del inf_s[-6:]
    print(inf_s)
    m = np.array(inf_s)
    # Store the book information locally; the second text node is the file name.
    try:
        np.save('书籍信息/%s.npy' % inf_s[1], m)
    except (IndexError, OSError, ValueError) as exc:
        # Best effort: report and move on when the page has too few fields or
        # the derived file name cannot be written.
        print('Could not save record %s: %r' % (marc, exc))