Scraping CNKI (cnki.net) search results


Scraping the cnki.net (CNKI) search results page

import requests
from lxml import etree

# Endpoint that returns the search-result grid as an HTML fragment
url = 'https://kns.cnki.net/KNS8/Brief/GetGridTableHtml'

# Request headers copied from the request captured in the browser's developer tools.
# The captured Content-Length is omitted: requests recomputes it for the actual payload.
head = '''
Accept: text/html, */*; q=0.01
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Cookie: RsPerPage=20; cnkiUserKey=cb04ea46-fcff-f318-6c7e-87f0ea44e355; Ecp_ClientId=7210125160700698453; x-s3-sid=>0a61D8patf86h5K0R0e3e200; SID_kns_new=kns123119; ASPSESSIONIDQCCSSQRB=EFJOBMNAMAOCIGHKBEHJBHDD; _pk_ses=*; Ecp_IpLoginFail=210312113.65.102.249; ASP.NET_SessionId=bvsbnlway43wcxl3dlxr2xap; SID_kns8=15123122; CurrSortField=%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27); CurrSortFieldType=desc; Ecp_ClientIp=113.65.102.249; knsLeftGroupSelectItem=; _pk_id=0d44cb3d-1be9-4ae8-8c33-020571bf12b8.1611562020.2.1615534630.1615534473.
Host: kns.cnki.net
Origin: https://kns.cnki.net
Referer: https://kns.cnki.net/kns8/defaultresult/index
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400
X-Requested-With: XMLHttpRequest
'''
# Turn the raw header block into a dict by splitting each line on the first ':'
headers = dict([[y.strip() for y in x.strip().split(':', 1)] for x in head.strip().split('\n') if x.strip()])
# POST form fields, also copied from the captured request.
# The query searches the subject field (SU) for "机器学习" (machine learning);
# $s and $page are placeholders that get filled in below.
head = '''
IsSearch: true
QueryJson: {"Platform":"","DBCode":"CJFQ","KuaKuCode":"","QNode":{"QGroup":[{"Key":"Subject","Title":"","Logic":1,"Items":[{"Title":"主题","Name":"SU","Value":"机器学习","Operate":"%=","BlurType":""}],"ChildItems":[]}]}}
SearchSql: $s
PageName: DefaultResult
HandlerId: 12
DBCode: CJFQ
KuaKuCodes: 
CurPage: $page
RecordsCntPerPage: 20
CurDisplayMode: listmode
CurrSortField: %e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2f(%e5%8f%91%e8%a1%a8%e6%97%b6%e9%97%b4%2c%27TIME%27)
CurrSortFieldType: desc
IsSortSearch: false
IsSentenceSearch: false
Subject: 
'''
s = '2827E4B6502D8710F4C63FA68A0E7A152D8972E3EF5541A46DDE8B3A62A549C1B093F0A10875FEBDE5B17F4F918A6F10CAA2EA622595552DEE59C9627930D8AAE3535B937469326E18AA84865BAD3DC6C1A7DA64988D2A81F1A429AD691BEC3C662D26E88914ED474507E583978013FB39E843499D82596B2618CD689FA015E1201CDC81DB346AAE41EC83FB19ABED536E48A1FE92964DABD22B02545450128F5FB427A463A372F788658E5867448A0351AC6F24D95CC948DFB0CB37E8C1A23780A0564D73B49E40272C81D1506BDA00D384062293240AB28BD54BFFA2E9599004813294DAAE9DC557225BBFC50DD159F16711F5B687512F51B828095A324839500E1F7B01DC60B8F87DDBBA355CD5DF58C0DC01DD0CE799EBAD3EC607C080220B7AE750060B65FA18A17AB336C8EA69BFC6DC4366CF21E8992BD2BFA918302C112D362BB3D5C5AD468BB5A37378E54DBF69EBF67EC9A747B73BE2EC3ADB9E2E38331F0D075D377F29AA8EF6611E638AE84CDA26EE1466114FE039C01999CCB06313C2D0AF22E87E3C1779A9F9F49ECDED0EC05BE8322DF4E4C059352F6FC22118CC19825019AB5CE62F9A987D11EAB5BE3CFADAF7313CC689DD00B6C5DD6B36ECF8AC7C7AC53B7C8708F764B933919096EC05EEB875A6AD2879246FEB134F1FFDC1C9B1BB30D40B57E77ECED06161AFB9DC88141FC8E563573DF42087FF1BA4FD1614F79821C3D0BC486FEB503C440CAE60D9A1DB7224FD9E3C11D4BFA530A47E1961AC2D9E6545669E7B29833F236B41142CBDBE42ACEB83C00B2BBCEF2BC1285121A726B7BE54AF979751697F9AC2CEF84ED5F792B4B5CF09C47A0B633062D32CDA7FF026B2B5455D6CA9179444D57929D3655B365338348ECD3C52B82459FEC4B4AFBD590DB4CDF9ED5A735F752B5FA0CB949685AB70EB7B58D576228BD6338DAEC12486EBA5E4D6020541B84F003C851A08C6FC2663CB1238BEE953BAB5BFD43ADC95977D5CE71E8884322CADE077924B38AA03D510FF395991659B780F7727D9E6F9801978EEB4AEF614774C6E6EA8C71E3C83E571228AE3C6E6573D1100DA46515AB000F8C47705393A962BD5E543C13A5014DBDC84537F02AAC6F8BD8D93311EA6A28ED30F77CBED8CD989AD7AC5A3210C3F2C693E170845A065A7ABCC7CC218ECAC938E5CF8B0E6E279BFA707B43A4DD28B1D278837733DA6F440EA161F189D2B648E44F4C2DB0FD8FCB31E93F65E05309793E39CA113FDC8147429ECBB911B7802A001DEDF41D8602026E3E3092F4DEB722BE75EF81B9FFAA403119CF4B22C49268AC2270120103673D43DB78FACF1FDD290DC598BA79A54F08AB726A19FE8EA76DA836BDC4C243F0A816B4AAD507C4ADCE578'
# s is the query SQL (SearchSql) for this search; it can be found in the search page's HTML
# Fill in the placeholders and build the form-data dict the same way as the headers
page = 1
head = head.replace('$s', s)
head = head.replace('$page', str(page))
data = dict([[y.strip() for y in x.strip().split(':', 1)] for x in head.strip().split('\n') if x.strip()])
# POST the query; the response body is an HTML fragment containing the result table.
# verify=False skips TLS certificate verification, which triggers an urllib3 warning.
res = requests.post(url, timeout=30, data=data, verify=False, proxies=None,
                    headers=headers).content.decode('utf-8')
# Each result title is an <a> inside the "name" cell of the result table
xp = etree.HTML(res)
aa = xp.xpath('//table[@class="result-table-list"]//tr//td[@class="name"]//a')
for a in aa:
    print(a.xpath('string(.)'))  # string(.) joins the text of nested tags (highlighted keywords etc.)
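
Because the form data already carries a CurPage field, paging through the result list only means changing that one value and posting again. The snippet below is a minimal sketch of that idea: fetch_titles is a hypothetical helper (not anything CNKI provides), it reuses the url, headers and data objects built above, and it assumes the captured Cookie and SearchSql values are still valid for your session; in practice they expire and have to be re-captured from the browser.

import time

# Minimal pagination sketch: reuses url, headers and data from the script above.
# The page range and the 2-second delay are arbitrary, polite defaults.
def fetch_titles(page_no):
    data['CurPage'] = str(page_no)  # only the page number changes between requests
    html = requests.post(url, timeout=30, data=data, verify=False,
                         headers=headers).content.decode('utf-8')
    tree = etree.HTML(html)
    links = tree.xpath('//table[@class="result-table-list"]//tr//td[@class="name"]//a')
    return [a.xpath('string(.)').strip() for a in links]

for p in range(1, 4):
    for title in fetch_titles(p):
        print(p, title)
    time.sleep(2)  # be polite to the server

If a later page comes back empty, compare the form data against a captured second-page request in the browser; the server may expect some flags (for example IsSearch) to change after the first page.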



 
