ISSN data for SCI journals from 小木虫 (muchong.com)


I'm a complete beginner and still not very familiar with web scraping. This post follows the article at https://blog.csdn.net/qubeijun/article/details/89163046?ops_request_misc=%257B%2522request%255Fid%25 and a few problems remain; I'm keeping it here as a learning record.

What I got out of this exercise: exporting the results to Excel.

Three issues remain unsolved:

1. The first page's data was not retrieved. -- Cause found: on lines 39 and 40 of the original code, the index after wrapper just needs to change from 8 to 7 (the script below already uses 7).

2. I still haven't found why looking up a specific journal fails; see the commented guess inside journal_name() below.

3. English journal names are not displayed correctly in the results; see the sketch at the end of this post.

 

import pandas as pd
import requests
from lxml import html
 
conn = requests.session()
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/63.0.3239.26 '
                         'Safari/537.36 Core/1.63.5733.400 '
                         'QQBrowser/10.2.2019.400'}
 
 
# Solve the arithmetic captcha (the prompt uses Chinese operator words)
def cal(sen):
    number = sen.split(':')[1].split('等于')[0]
    ans = 0
    if '加' in sen:
        ans = int(number.split('加')[0]) + int(number.split('加')[1])
    elif '减' in sen:
        ans = int(number.split('减')[0]) - int(number.split('减')[1])
    elif '乘以' in sen:
        ans = int(number.split('乘以')[0]) * int(number.split('乘以')[1])
    elif '除以' in sen:
        ans = int(number.split('除以')[0]) / int(number.split('除以')[1])
    return int(ans)
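
# A quick sanity check of cal() on a made-up captcha sentence (the exact
# wording and punctuation the site uses are assumptions here):
#   cal('验证问题:3加5等于多少')  -> 8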
 
 
# Scrape every SCI journal listed on the site
def all_journal():

    # Page 1 (its journal table is in the 7th div.wrapper; see issue 1 above)
    url = 'http://muchong.com/bbs/journal.php'
    rep2 = conn.get(url, headers=headers)
    qikan = html.fromstring(rep2.text)
    head_name = qikan.xpath('//div[@class="wrapper"][7]/div[@class="forum_head"]//td/text()')
    all_qikan = qikan.xpath('//div[@class="wrapper"][7]/div[@class="forum_body"]//tbody')
    print(head_name)  # show the table header
    xx = []
    for a in all_qikan:
        x = a.xpath('string(.)')  # all text inside one journal's <tbody>
        xsplit = x.split()        # split that text into fields on whitespace
        xx.append(xsplit)

    # Pages 2 onward (the table sits in the 6th div.wrapper on these pages)
    for i in range(2, 203):
        url = 'http://muchong.com/bbs/journal.php?from=emuch&view=&classid=0&class_credit=0&page=' + str(i)
        rep2 = conn.get(url, headers=headers)
        qikan = html.fromstring(rep2.text)
        all_qikan = qikan.xpath('//div[@class="wrapper"][6]/div[@class="forum_body"]//tbody')
        for a in all_qikan:
            x = a.xpath('string(.)')
            xsplit = x.split()
            xx.append(xsplit)
    df = pd.DataFrame(xx)  
    saveData(df)  
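    # Optional idea (untested): when the number of scraped header cells matches
    # the number of fields per row, the header could label the columns, e.g.
    #   df = pd.DataFrame(xx, columns=head_name)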
 
def journal_name(name):  # look up a single journal by name (still broken, see issue 2)
    url = 'http://muchong.com/bbs/journal.php'
    name = name.encode("GBK")
    postdata = {
        'issn': '',
        'tagname': '',
        'name': name,
        'ssubmit': '(unable to decode value)',
        'accept-charset': "utf-8"
    }
    rep = conn.post(url, data=postdata, headers=headers)
    qikan = html.fromstring(rep.text)
    every_qikan = qikan.xpath('//div[@class="wrapper"][6]/div[@class="forum_body"]//tbody')
    for a in every_qikan[:]:
        x1 = a.xpath('tr/th/a/@href')
        url = 'http://muchong.com/bbs/'+x1[0]
        print(url)
        detail(url)
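    # Unverified guess at issue 2: the site appears to serve GBK, so the POST
    # body may need to be fully GBK percent-encoded (including the real value
    # of the submit button, shown only as "(unable to decode value)" above),
    # e.g. built with urllib.parse.urlencode(postdata, encoding='gbk') and sent
    # as application/x-www-form-urlencoded.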
 
 
def detail(url):
    rep = conn.get(url, headers=headers)
    _detail = html.fromstring(rep.text)
    # details contributed by forum users
    deta = _detail.xpath('//div[@class="wrapper"][4]/div[@class="forum_explan bg_global"][1]//tr')
    for i in deta:
        i1 = i.xpath('string(.)')
        print(i1.split())
 
 
def saveData(data):
    # writing .xlsx goes through openpyxl, so xlwt is not needed
    data.to_excel('usci.xlsx', index=False)
    
    
print('---------- specific journal ----------')
# journal_name('CELL')  # still broken, see issue 2
print('---------- all journals ----------')
all_journal()
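
For issue 3, two guesses (untested, not confirmed fixes): requests may mis-decode the page if it is served as GBK, and the whitespace split() chops a multi-word English title into several columns. Below is a minimal sketch that sets the encoding explicitly and reads each table cell on its own; the journal_rows name, the wrapper_index parameter and the ./tr/th | ./tr/td cell layout are my assumptions, not something from the original post.

def journal_rows(page_url, wrapper_index):
    rep = conn.get(page_url, headers=headers)
    rep.encoding = 'gbk'  # assumption: the page is served as GBK
    tree = html.fromstring(rep.text)
    xp = '//div[@class="wrapper"][%d]/div[@class="forum_body"]//tbody' % wrapper_index
    rows = []
    for tb in tree.xpath(xp):
        # keep each <th>/<td> cell as one field so multi-word names stay in one column
        cells = tb.xpath('./tr/th | ./tr/td')
        rows.append([c.xpath('string(.)').strip() for c in cells])
    return rows

With the indices used in all_journal() above, page 1 would use wrapper_index=7 and the later pages wrapper_index=6.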
