3. Scraping with lxml
Excel storage: scraping Chinese novels from Qidian
import xlwt
import requests
from lxml import etree
import time

all_info_list = []

def get_info(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    # each <li> in the result list is one book
    infos = selector.xpath('//ul[@class="all-img-list cf"]/li')
    for info in infos:
        title = info.xpath('div[2]/h4/a/text()')[0]
        author = info.xpath('div[2]/p[1]/a[1]/text()')[0]
        style_1 = info.xpath('div[2]/p[1]/a[2]/text()')[0]
        style_2 = info.xpath('div[2]/p[1]/a[3]/text()')[0]
        style = style_1 + '·' + style_2
        complete = info.xpath('div[2]/p[1]/span/text()')[0]
        introduce = info.xpath('div[2]/p[2]/text()')[0].strip()
        # strip the trailing 万字 ("x10,000 characters") unit from the word count
        word = info.xpath('div[2]/p[3]/span/text()')[0].strip('万字')
        info_list = [title, author, style, complete, introduce, word]
        all_info_list.append(info_list)

if __name__ == '__main__':
    urls = ['http://a.qidian.com/?page={}'.format(str(i)) for i in range(1, 101)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle requests; time is imported for this
    header = ['title', 'author', 'style', 'complete', 'introduce', 'word']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h, field in enumerate(header):
        sheet.write(0, h, field)
    # one row per book, starting just below the header row
    for i, row in enumerate(all_info_list, 1):
        for j, data in enumerate(row):
            sheet.write(i, j, data)
    book.save('xiaoshuo.xls')
Image scraping: downloading pictures from Meizitu
import requests
import os
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}
path = 'G:/photo/'

def get_girlphoto(url):
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    # the real image URL is lazy-loaded through the data-original attribute
    girlphoto_urls = selector.xpath('//div/ul/li/a/img/@data-original')
    print(girlphoto_urls)
    for item in girlphoto_urls:
        data = requests.get(item, headers=headers)
        # name each file after the last 10 characters of its URL
        with open(path + item[-10:], 'wb') as f:
            f.write(data.content)

if __name__ == '__main__':  # entry point
    os.makedirs(path, exist_ok=True)  # create the save folder; os is imported for this
    urls = ["https://www.mzitu.com/page/{}".format(str(i)) for i in range(2, 11)]
    for url in urls:
        get_girlphoto(url)
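Image hosts like this one often gate their CDN behind a Referer check (hotlink protection), so direct downloads can come back empty or as a placeholder. A minimal hedged tweak, assuming the listing page URL is accepted as the Referer; download_image is a new helper, not part of the original script:

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # stand-in for the UA used above

def download_image(img_url, page_url, save_path):
    # send the page the image was found on as the Referer;
    # many image hosts reject requests that lack one
    img_headers = dict(headers, Referer=page_url)
    data = requests.get(img_url, headers=img_headers, timeout=10)
    with open(save_path, 'wb') as f:
        f.write(data.content)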
Database storage: scraping the Douban Music Top 250
import requests
import time  # imported for throttling between requests
from lxml import etree
import pymysql

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}  # request headers

def get_info(url):  # scrape one listing page and store its rows
    db = pymysql.connect(host='localhost', user='root', password='123456', database='music')
    cursor = db.cursor()
    wb_data = requests.get(url, headers=headers)
    selector = etree.HTML(wb_data.text)
    names = selector.xpath('//div[@class="pl2"]/a/text()')
    titles = selector.xpath('//p[@class="pl"]/text()')
    for name, title in zip(names, titles):
        # intended record shape: {'name': ..., 'author': ..., 'time': ...}
        a = name.strip()
        b = title.split('/')[0]  # artist
        c = title.split('/')[1]  # release date
        # a parameterized insert avoids the quoting bugs (and SQL injection)
        # of building the statement by string concatenation
        cursor.execute("INSERT INTO music VALUES (%s, %s, %s)", (a, b, c))
        print(a, b, c)
    db.commit()
    cursor.close()
    db.close()

if __name__ == '__main__':
    # pages 2-9 of the Top 250: start=25, 50, ..., 200
    urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(25, 225, 25)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle; time is imported for this
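The script assumes a music table already exists inside the music database; pymysql will raise otherwise. A minimal sketch of a matching schema, with column names taken from the record shape noted in the code (name, author, time) and column sizes that are plain assumptions:

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456', database='music')
cursor = db.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS music (
        name   VARCHAR(255),
        author VARCHAR(255),
        `time` VARCHAR(255)
    ) DEFAULT CHARSET = utf8mb4
""")
db.commit()
cursor.close()
db.close()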
Multithreaded crawler: scraping Budejie jokes
PyCharm-compatible crawler
import requests
from queue import Queue
from lxml import etree
import threading

class bsSpider(object):
    """Producer/consumer spider: one queue of URLs, one queue of fetched pages."""
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = 'http://www.budejie.com/'
        # queue of page URLs waiting to be fetched
        self.urlQueue = Queue()
        # queue of fetched HTML waiting to be parsed
        self.resQueue = Queue()

    def getUrl(self):
        for p in range(1, 51):
            url = self.url + str(p)
            self.urlQueue.put(url)

    def getHtml(self):
        while True:
            # take a URL off the URL queue
            url = self.urlQueue.get()
            res = requests.get(url, headers=self.headers)
            res.encoding = "utf-8"
            html = res.text
            # hand the page over to the parser threads
            self.resQueue.put(html)
            # mark this URL as done
            self.urlQueue.task_done()

    def getText(self):
        while True:
            # take a fetched page off the response queue
            html = self.resQueue.get()
            parseHtml = etree.HTML(html)
            title_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
            for title in title_list:
                self.show(title)
            # mark this page as done
            self.resQueue.task_done()

    def show(self, title):
        print(title)

    def run(self):
        thList = []
        self.getUrl()  # enqueue the 50 page URLs
        # fetcher threads
        for i in range(5):
            thRes = threading.Thread(target=self.getHtml)
            thList.append(thRes)
        # parser threads
        for i in range(5):
            thParse = threading.Thread(target=self.getText)
            thList.append(thParse)
        # start all workers
        for th in thList:
            th.daemon = True  # daemon threads are torn down when the main thread exits
            th.start()
        # block until every queued URL and every queued page is marked done;
        # only then does the main thread continue (and the daemons die with it)
        self.urlQueue.join()
        self.resQueue.join()

if __name__ == '__main__':
    spider = bsSpider()
    spider.run()
Not compatible with PyCharm; must be run in IDEA (multiprocessing spawns worker processes that re-import the script, so it has to run as a plain script behind the __main__ guard)
import requests
from lxml import etree
import time
from multiprocessing import Pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
}

def get_girlphoto(url):  # despite the name (copied from the image example), this scrapes Xiaozhu rental listings
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    # links to each listing's detail page
    purl = selector.xpath("//a[@class='resule_img_a']/@href")
    for i in purl:
        html = requests.get(i, headers=headers)
        selector = etree.HTML(html.text)
        titles = selector.xpath("//h4/em/text()")
        addresses = selector.xpath("//span[@class='pr5']/text()")
        dolors = selector.xpath("//*[@id='pricePart']/div[1]/span/text()")
        people_names = selector.xpath("//a[@class='lorder_name']/@title")
        for title, address, dolor, people_name in zip(titles, addresses, dolors, people_names):
            # the rows are built but discarded: this example only measures timing
            info_list = [title, str(address).strip(), dolor, people_name]

if __name__ == '__main__':  # entry point
    urls = ["http://bj.xiaozhu.com/search-duanzufang-p{}-0/".format(str(i)) for i in range(1, 10)]
    # single process
    start1 = time.time()
    for i in urls:
        get_girlphoto(i)
    end1 = time.time()
    print('single process', end1 - start1)
    # pool of 10 worker processes
    start2 = time.time()
    pool = Pool(processes=10)
    pool.map(get_girlphoto, urls)
    end2 = time.time()
    print('10 processes', end2 - start2)
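As a usage note, Pool is usually wrapped in a context manager so the worker processes are cleaned up even if map raises; the pool must still be created under the __main__ guard, since each worker re-imports the module. A minimal sketch reusing the names from the script above:

from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(processes=10) as pool:
        pool.map(get_girlphoto, urls)  # same call as above, with automatic cleanup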
Crawling through IP proxies
Free proxy list: https://www.xicidaili.com/nn/
import requests

proxies = {
    "http": "http://192.10.1.10:8080",
    "https": "http://193.121.1.10:9080",
}
requests.get("http://targetwebsite.com", proxies=proxies)
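Free proxies from lists like the one above die quickly, so it is worth probing each one before trusting it. A minimal sketch; httpbin.org/ip is just a convenient echo endpoint, and the proxy addresses are the placeholders from above:

import requests

def check_proxy(proxy, test_url='http://httpbin.org/ip', timeout=5):
    # a proxy is usable if a simple request through it succeeds quickly
    try:
        res = requests.get(test_url, proxies={'http': proxy, 'https': proxy}, timeout=timeout)
        return res.status_code == 200
    except requests.RequestException:
        return False

candidates = ['http://192.10.1.10:8080', 'http://193.121.1.10:9080']
live = [p for p in candidates if check_proxy(p)]
print(live)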
Mobile crawler: scraping Baidu Wenku documents
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver import ActionChains
import time

# mobile User-Agent so Baidu Wenku serves the mobile page
# (note: this dict is never actually passed to the driver below)
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"
}

def get_num(url):
    driver = webdriver.PhantomJS()
    driver.get(url)
    html = driver.page_source
    page_count_get = BeautifulSoup(html, 'lxml')
    page_count_gets = page_count_get.find(class_='page-count')
    pagecount = page_count_gets.get_text()
    # drop the leading character (the counter reads like "/12") to get the page total
    num = int(pagecount[1:])
    temp = num
    page1 = 1       # start scraping from page one
    all_text = ''   # accumulated document text
    while page1 <= temp:
        x = 'pageNo-' + str(page1)  # id of the current page's element
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        soups = soup.find_all(id=x)  # every element whose id is the current page
        text = ''  # reset so the checks below see the current page only
        for each in soups:
            text = each.get_text()
            all_text += text
        if num > 1 and text == '':
            # counter still high and the page is empty:
            # click "continue reading" to load more pages (simulated via ActionChains)
            page = driver.find_element_by_css_selector("#html-reader-go-more")
            pagebutton = driver.find_element_by_css_selector("#html-reader-go-more .banner-more-btn")
            ActionChains(driver).move_to_element(page).click(pagebutton).perform()
            time.sleep(2)
            num = num / 5        # shrink the counter so the button is clicked only a few times
            page1 = page1 - 1    # retry the same page after loading
        elif num < 1 and text == '':
            # no more clicking needed but the page is still empty: scroll down to it
            target = driver.find_element_by_id(x)
            driver.execute_script("arguments[0].scrollIntoView();", target)
            time.sleep(2)
            page1 = page1 - 1    # retry the same page after scrolling
        page1 = page1 + 1
    print(all_text)
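webdriver.PhantomJS() is deprecated and has been removed from newer Selenium releases. A hedged replacement sketch using headless Chrome instead (assumes a matching chromedriver is on PATH; the mobile UA mirrors the headers dict above, which PhantomJS never consumed):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# serve the mobile site, as the original mobile User-Agent intended
options.add_argument('user-agent=Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) '
                     'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36')
driver = webdriver.Chrome(options=options)  # drop-in for webdriver.PhantomJS() in get_num()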
4. Case studies
1. Scraping Meituan merchant data
1. Non-food listings
import xlwt
import requests
from lxml import etree
import time

all_info_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Cookie': '_lxsdk_cuid=16bd560125bc8-0f8928eb371893-e323069-1fa400-16bd560125bc8;ci=1;rvct=1;_hc.v=fcff8e99-91bf-6483-6184-9a249bc61b80.1562654914;Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1562749731;__utma=211559370.351478480.1562749734.1562749734.1562749734.1;__utmz=211559370.1562749734.1.1.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search;client-id=da6ade00-71c3-4821-af5f-d539b16b5955;uuid=c2d42edd0bda4f44acc5.1563193350.1.0.0;_lx_utm=utm_source%3Dbaidu%26utm_campaign%3Dbaidu%26utm_medium%3Dorganic%26utm_content%3Dzt_search;__mta=213669163.1563193361179.1563193361179.1563193361179.1;_lxsdk_s=16bf595928c-1b-bd2-c98%7C%7C5'}

def get_info(url):
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    titles = selector.xpath('//div[@class="list-item-desc-top"]/a/text()')
    address = selector.xpath('//div[@class="item-site-info clearfix"]/span[2]/text()')
    p = selector.xpath('//div[@class="item-eval-info clearfix"]/span[1]/text()')  # ratings sit at every other entry, starting from the first
    r = selector.xpath('//div[@class="item-price-info"]/span/text()')  # prices sit at every other entry, starting from the second
    pingfens = []
    renjuns = []
    for i in range(0, len(p) - 1, 2):
        pingfens.append(p[i])
    for i in range(1, len(r) - 1, 2):
        renjuns.append(r[i])
    for title, addres, pingfen, renjun in zip(titles, address, pingfens, renjuns):
        info_list = [title, addres, pingfen + "分", "¥" + renjun]
        all_info_list.append(info_list)
        print(info_list)

if __name__ == '__main__':
    urls = ["https://bj.meituan.com/jiehun/c20198/pn{}/".format(str(i))
            for i in range(1, 20)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle; time is imported for this
    # get_info("https://bj.meituan.com/yundongjianshen/c20268/")  # single-category test call
    header = ['店名', '地址', '评分', '人均']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h, field in enumerate(header):
        sheet.write(0, h, field)
    for i, row in enumerate(all_info_list, 1):
        for j, data in enumerate(row):
            sheet.write(i, j, data)
    book.save('meituan.xls')  # was 'xiaoshuo.xls', a copy-paste slip from the novel example
2. Food listings
import xlwt
import requests
from lxml import etree
import time

all_info_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Cookie': '_lxsdk_cuid=16bd560125bc8-0f8928eb371893-e323069-1fa400-16bd560125bc8;ci=1;rvct=1;_hc.v=fcff8e99-91bf-6483-6184-9a249bc61b80.1562654914;Hm_lvt_f66b37722f586a240d4621318a5a6ebe=1562749731;__utma=211559370.351478480.1562749734.1562749734.1562749734.1;__utmz=211559370.1562749734.1.1.utmcsr=baidu|utmccn=baidu|utmcmd=organic|utmcct=zt_search;client-id=da6ade00-71c3-4821-af5f-d539b16b5955;uuid=c2d42edd0bda4f44acc5.1563193350.1.0.0;_lx_utm=utm_source%3Dbaidu%26utm_campaign%3Dbaidu%26utm_medium%3Dorganic%26utm_content%3Dzt_search;__mta=213669163.1563193361179.1563193361179.1563193361179.1;_lxsdk_s=16bf595928c-1b-bd2-c98%7C%7C5'}

def get_info(url):
    html = requests.get(url, headers=headers)
    page = html.text
    # the food pages embed shop data as JSON in the page source;
    # cut out the chunk between "poiInfos" and "comHeader"
    html2 = page[page.find("poiInfos"):page.find("comHeader")]
    items = html2.split("poiId")  # one chunk per shop
    for i in items:
        s = i[i.find("title"):i.find("avgPrice")]
        a = s[s.find("title"):s.find("avgScore")].strip('title":"')
        title = a.strip('",')
        b = s[s.find("address"):].strip('address":"')
        addres = b.strip('",')
        c = s[s.find("avgScore"):s.find("allCommentNum")].strip('avgScore":')
        pingfen = c.strip(',')
        # note: this pulls allCommentNum (the review count) into the
        # per-person-price column; avgPrice looks like the intended key
        d = s[s.find("allCommentNum"):s.find("address")].strip('allCommentNum":')
        renjun = d.strip(',')
        info_list = [title, addres, pingfen + "分", "¥" + renjun]
        all_info_list.append(info_list)
        print(info_list)

if __name__ == '__main__':
    urls = ["http://bj.meituan.com/meishi/c54/pn{}/".format(str(i))
            for i in range(1, 20)]
    for url in urls:
        get_info(url)
        time.sleep(1)  # throttle; time is imported for this
    header = ['店名', '地址', '评分', '人均']
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    for h, field in enumerate(header):
        sheet.write(0, h, field)
    for i, row in enumerate(all_info_list, 1):
        for j, data in enumerate(row):
            sheet.write(i, j, data)
    book.save('meituan_meishi.xls')  # was 'xiaoshuo.xls', a copy-paste slip
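The slicing above breaks as soon as the key order in the embedded JSON shifts. Since the shop list is already JSON, a hedged alternative is to cut the poiInfos array out with a regular expression and hand it to json.loads. The field names are the same keys the slicing code searches for, and avgPrice is used for the per-person column where the original grabbed allCommentNum:

import json
import re

def parse_pois(page_text):
    # grab the array that follows "poiInfos": in the page source;
    # assumes the shop objects contain no nested arrays
    m = re.search(r'"poiInfos":(\[.*?\])', page_text)
    if not m:
        return []
    rows = []
    for poi in json.loads(m.group(1)):
        rows.append([poi.get('title'),
                     poi.get('address'),
                     str(poi.get('avgScore')) + '分',
                     '¥' + str(poi.get('avgPrice'))])
    return rows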
2. Online translation with Youdao Dict
import requests
import time
import hashlib  # unused here: salt/sign are hard-coded below (see the sketch after this listing)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Origin': 'http://fanyi.youdao.com/',   # Origin is only sent with POST requests
    'Referer': 'http://fanyi.youdao.com/',  # Referer is sent with every request type
}

class YoudaoFanyi:
    def get_info(self, key):
        data = {  # form fields, found by capturing the browser's request
            'i': key,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': '15675741889781',
            'sign': 'fa78728c931e9e682fae1c39c2b43b3a',
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_CLICKBUTTION',  # marks whether the query was submitted by button click or by Enter
            'typoResult': 'true'
        }
        # one of Youdao's translate endpoints
        url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom=null'
        res = requests.post(url, headers=headers, data=data)
        youdaojson = res.json()
        print('Translation: %s' % (youdaojson['translateResult'][0][0]['tgt']))
        time.sleep(2)

if __name__ == '__main__':
    youdao = YoudaoFanyi()
    while True:
        key = input("Enter text to translate ('quit' to exit): ").strip()
        if key == 'quit':
            break
        youdao.get_info(key)
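The hard-coded salt and sign above are tied to one captured request and will eventually be rejected; the unused hashlib import suggests they were meant to be computed. A hedged sketch of the commonly documented scheme from this era: salt is a millisecond timestamp plus a random digit, and sign is an MD5 over client id + query + salt + a client secret. The secret rotates between front-end releases, so SECRET_KEY below is a placeholder you would have to capture from the site's fanyi.min.js yourself:

import hashlib
import random
import time

SECRET_KEY = '<capture from fanyi.min.js>'  # placeholder: rotates between releases

def make_salt_and_sign(query):
    # salt: millisecond timestamp plus one random digit
    salt = str(int(time.time() * 1000)) + str(random.randint(0, 9))
    # sign: md5(client id + query + salt + client secret)
    raw = 'fanyideskweb' + query + salt + SECRET_KEY
    return salt, hashlib.md5(raw.encode('utf-8')).hexdigest()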