"""
小白一枚,爬虫还不熟悉。本文学习了文章
https://blog.csdn.net/qubeijun/article/details/89163046,
有些地方还存在问题,留作学习记录!
本次学习收获:将结果导出到 Excel。
仍有 3 点没有解决:
1. 第一页数据没有成功获取——已找到原因:39、40 行代码的 wrapper 后面的数字从 8 改为 7 即可;
2. 指定期刊数据还没找到错误原因;
3. 结果里英文期刊的名称不能正确显示。
"""
import pandas as pd
import requests
from lxml import html
from lxml import etree
import xlwt
# One shared session so cookies persist across all requests to the site.
conn = requests.session()

# Desktop-browser User-Agent string; sent with every request so the site
# does not reject the default python-requests client identifier.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.26 '
        'Safari/537.36 Core/1.63.5733.400 '
        'QQBrowser/10.2.2019.400'
    ),
}
# 验证码计算
def cal(sen):
    """Solve the site's arithmetic captcha sentence and return the answer.

    The sentence looks like "提示:3加5等于多少"; the expression sits between
    the ASCII colon and "等于". Supported operators: 加 (+), 减 (-),
    乘以 (*), 除以 (/). Division is truncated to an int; an unrecognized
    operator yields 0.
    """
    expression = sen.split(':')[1].split('等于')[0]
    # Checked in the same order as the original if/elif chain; the keyword
    # test is against the whole sentence, the split against the expression.
    operations = (
        ('加', lambda a, b: a + b),
        ('减', lambda a, b: a - b),
        ('乘以', lambda a, b: a * b),
        ('除以', lambda a, b: a / b),
    )
    answer = 0
    for word, apply_op in operations:
        if word in sen:
            parts = expression.split(word)
            answer = apply_op(int(parts[0]), int(parts[1]))
            break
    return int(answer)
# SCI期刊
def all_journal():
    """Scrape every page of the muchong.com SCI journal listing into Excel.

    Page 1 uses a different DOM layout than pages 2..202: the listing table
    sits under the 7th div.wrapper on the first page but under the 6th on
    the later ones, so the wrapper index is parameterized in the helper.
    The collected rows are handed to saveData() as a pandas DataFrame.
    """
    rows = []

    # Page 1 — wrapper index 7 (this was the bug noted at the top of the
    # file: using 6 here returns nothing for the first page).
    first_url = 'http://muchong.com/bbs/journal.php'
    header_cells = _scrape_journal_page(first_url, 7, rows)
    print(header_cells)  # show the table header once

    # Pages 2..202 — same table, wrapper index 6.
    for page_number in range(2, 203):
        url = ('http://muchong.com/bbs/journal.php'
               '?from=emuch&view=&classid=0&class_credit=0&page='
               + str(page_number))
        _scrape_journal_page(url, 6, rows)

    saveData(pd.DataFrame(rows))


def _scrape_journal_page(url, wrapper_index, rows):
    """Fetch one listing page and append each journal row to *rows*.

    Each row is the whitespace-split text content of one <tbody> in the
    listing table. Returns the header cell texts of that table.
    """
    response = conn.get(url, headers=headers)
    page = html.fromstring(response.text)
    base = '//div[@class="wrapper"][%d]' % wrapper_index
    header = page.xpath(base + '/div[@class="forum_head"]//td/text()')
    for body in page.xpath(base + '/div[@class="forum_body"]//tbody'):
        rows.append(body.xpath('string(.)').split())
    return header
def journal_name(name):
    """Search muchong.com for one journal by name and print its detail pages.

    The site's search form expects GBK-encoded field values, so *name* is
    encoded before POSTing.

    NOTE(review): the 'ssubmit' value below is a garbled placeholder carried
    over from the original source — presumably it should be the (GBK-encoded)
    submit-button label; confirm against the live form. This is likely why
    the author reports this function as broken.
    """
    url = 'http://muchong.com/bbs/journal.php'
    name = name.encode("GBK")
    postdata = {
        'issn': '',
        'tagname': '',
        'name': name,
        'ssubmit': '(unable to decode value)',
        'accept-charset': "utf-8",
    }
    rep = conn.post(url, data=postdata, headers=headers)
    qikan = html.fromstring(rep.text)
    every_qikan = qikan.xpath('//div[@class="wrapper"][6]/div[@class="forum_body"]//tbody')
    if not every_qikan:
        # Empty search result (or the page layout changed) — report instead
        # of silently doing nothing.
        print('no results for', name)
        return
    for a in every_qikan:
        x1 = a.xpath('tr/th/a/@href')
        if not x1:
            # Row without a detail link — skip instead of IndexError.
            continue
        url = 'http://muchong.com/bbs/' + x1[0]
        print(url)
        detail(url)
def detail(url):
    """Fetch one journal's detail page and print each info row, token-split."""
    response = conn.get(url, headers=headers)
    page = html.fromstring(response.text)
    # Info table contributed by forum users.
    info_rows = page.xpath(
        '//div[@class="wrapper"][4]/div[@class="forum_explan bg_global"][1]//tr')
    for info_row in info_rows:
        row_text = info_row.xpath('string(.)')
        print(row_text.split())
def saveData(data):
    """Write the scraped journal table (a pandas DataFrame) to usci.xlsx."""
    output_path = 'usci.xlsx'
    data.to_excel(output_path, index=False)
if __name__ == '__main__':
    # Guard the driver code so importing this module does not kick off a
    # full 202-page scrape as a side effect.
    print('----------指定期刊----------')
    # journal_name('CELL')  # known-broken; see the notes at the top of the file
    print('----------所有期刊----------')
    all_journal()