Python 抓取百度百科

使用 Python 抓取百度百科词条的结构化信息（基本信息栏）

import pymysql
import re
import requests
from lxml import html
import xlwt,xlrd
def _node_text(node):
    """Return the text of an lxml element, ignoring nested markup."""
    # A <br> inside a value separates list items; join them with '、'.
    # NOTE(review): inferred from the published listing, where that literal
    # survived only as a raw line break -- confirm against the original.
    for br in node.xpath('.//br'):
        br.tail = '、' + (br.tail or '')
    # Baike pads labels/values with non-breaking spaces; drop them.
    return node.text_content().replace('\xa0', '')


def baidubaike(name):
    """Fetch a Baidu Baike entry and return its basic-info box as a dict.

    Args:
        name: Entry title; converted with ``str()`` and appended to the
            item URL.

    Returns:
        A dict mapping each basic-info label to its value text, or ``None``
        when the page yields a different number of labels and values (a
        message is printed in that case).

    Raises:
        requests.RequestException: On network failure or timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    base_url = 'https://baike.baidu.com/item/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    url = base_url + str(name)

    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.url)
    page = response.content.decode("utf-8").replace('\n', '')

    # NOTE(review): the published listing lost its tag-stripping literals
    # (they appear as empty strings, so str.replace('', '、') would insert a
    # separator between every character).  Text is therefore extracted per
    # element instead of by rewriting the raw HTML.  Footnote markers and
    # other inline elements are kept as plain text -- confirm whether the
    # original removed them.
    tree = html.fromstring(page)
    label_nodes = tree.xpath('//dt[@class="basicInfo-item name"]')
    value_nodes = tree.xpath('//dd[@class="basicInfo-item value"]')

    labels = [_node_text(node) for node in label_nodes]
    values = [_node_text(node) for node in value_nodes]

    if len(labels) != len(values):
        # Labels and values cannot be paired reliably; report and give up.
        print(name, "出现了一个错误")
        return None
    return dict(zip(labels, values))


if __name__ == "__main__":
    aa = baidubaike("刘诗诗")
    print(aa)

使用 Python 从 Excel 读取词条名称，并将抓取到的数据写入 Excel

import pymysql
import re
import requests
from lxml import html

import xlwt,xlrd
def _node_text(node):
    """Return the text of an lxml element, ignoring nested markup."""
    # A <br> inside a value separates list items; join them with '、'.
    # NOTE(review): inferred from the published listing, where that literal
    # survived only as a raw line break -- confirm against the original.
    for br in node.xpath('.//br'):
        br.tail = '、' + (br.tail or '')
    # Baike pads labels/values with non-breaking spaces; drop them.
    return node.text_content().replace('\xa0', '')


def baidubaike(name):
    """Fetch a Baidu Baike entry and return its basic-info box as a dict.

    Args:
        name: Entry title; converted with ``str()`` and appended to the
            item URL.

    Returns:
        A dict mapping each basic-info label to its value text, or ``None``
        when the page yields a different number of labels and values (a
        message is printed in that case).

    Raises:
        requests.RequestException: On network failure or timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    base_url = 'https://baike.baidu.com/item/'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    url = base_url + str(name)

    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.url)
    page = response.content.decode("utf-8").replace('\n', '')

    # NOTE(review): the published listing lost its tag-stripping literals
    # (they appear as empty strings, so str.replace('', '、') would insert a
    # separator between every character).  Text is therefore extracted per
    # element instead of by rewriting the raw HTML.  Footnote markers and
    # other inline elements are kept as plain text -- confirm whether the
    # original removed them.
    tree = html.fromstring(page)
    label_nodes = tree.xpath('//dt[@class="basicInfo-item name"]')
    value_nodes = tree.xpath('//dd[@class="basicInfo-item value"]')

    labels = [_node_text(node) for node in label_nodes]
    values = [_node_text(node) for node in value_nodes]

    if len(labels) != len(values):
        # Labels and values cannot be paired reliably; report and give up.
        print(name, "出现了一个错误")
        return None
    return dict(zip(labels, values))


def main():
    """Scrape every name in column 0 of the input sheet into a new workbook.

    The output sheet has one header row holding every label seen across all
    entries, followed by one row per input name in the input order.
    """
    # Read the entry names from the first column of the first sheet.
    readbook = xlrd.open_workbook('C:\\Users\\root\\Desktop\\6.xls')
    sheet = readbook.sheet_by_index(0)
    data = sheet.col_values(0)

    # Fetch each entry exactly once and remember the result, so the write
    # phase below does not hit the network a second time.
    records = []
    column_of = {}  # label -> column index, in first-seen order
    for entry in data:
        print(entry)
        # A failed scrape returns None; treat it as an empty row.
        info = baidubaike(entry) or {}
        records.append(info)
        for label in info:
            column_of.setdefault(label, len(column_of))

    headlist = list(column_of)
    print(headlist)

    # Step 1: create the workbook and a worksheet.
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')

    # Step 2: write the header row; arguments are (row, column, value).
    for col, label in enumerate(headlist):
        worksheet.write(0, col, label)

    # Step 3: one row per input name, each value under its label's column.
    for row, info in enumerate(records, start=1):
        for label, value in info.items():
            worksheet.write(row, column_of[label], value)

    # Step 4: save -- nothing reaches disk until this call.
    workbook.save('C:\\Users\\root\\Desktop\\ls.xls')


if __name__ == "__main__":
    main()

你可能感兴趣的:(python抓取百度百科)