This week was my first contact with web crawlers. After a quick introduction at https://scrapy-chs.readthedocs.io/zh_CN/latest/intro/tutorial.html, I wrote a few simple crawlers under 马神's guidance.
Since I had never touched web development, HTML, CSS, or XPath before, reading page source was a struggle. At first I also had no idea how to locate elements and took quite a few detours, so I am writing this down as a record.
Before crawling with the scrapy library, I first watched Prof. Song Tian's (Beijing Institute of Technology) videos on Bilibili, but those mainly cover the requests and re libraries, plus BeautifulSoup. Typing the code along with the videos only gave me a half-understanding, since I had mostly forgotten regular expressions anyway. I am posting part of that source code here for anyone who needs it.
1. Scraping the Chinese university rankings:
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):
    # fetch the page and return its text, or an empty string on any failure
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def fillUnivList(ulist, html):
    # each <tr> under <tbody> is one university; skip the NavigableString nodes
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find("tbody").children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr("td")
            ulist.append([tds[0].string, tds[1].string, tds[2].string])

def printUnivList(ulist, num):
    print("{:^10}\t{:^6}\t{:^10}".format("排名", "学校名称", "总分"))
    for i in range(num):
        u = ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))

def main():
    uinfo = []
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)

main()
2. Scraping stock market information:
import requests
import traceback
from bs4 import BeautifulSoup
import re

def getHTMLText(url, code='utf-8'):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    # collect stock codes such as sh600000 / sz000001 from the href attributes
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r'[s][hz]\d{6}', href)[0])
        except:
            continue
    return ""

def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'stockname': name.text.split()[0]})
            # the <dt>/<dd> pairs hold the field names and their values
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            traceback.print_exc()
            continue

def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'E:/baidustockinfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
3. Scraping Taobao product prices
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import re

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

def parsePage(ilt, html):
    # prices and titles live in the page's embedded JSON, e.g. "view_price":"99.00",
    # so plain regular expressions are enough to pull them out
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])   # eval() just strips the surrounding quotes
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("")

def printGoodsList(ilt):
    tlpt = "{:4}\t{:8}\t{:16}"
    print(tlpt.format("num", "price", "goodsname"))
    count = 0
    for g in ilt:
        count = count + 1
        print(tlpt.format(count, g[0], g[1]))

def main():
    goods = 'school bag'
    depth = 2                                       # number of result pages to fetch
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(48 * i)   # Taobao shows 48 items per page
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            continue
    printGoodsList(infoList)

main()
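In parsePage, eval() is used only to strip the surrounding quotes from matched fragments such as "view_price":"99.00". Since calling eval() on scraped text is a slightly uneasy design choice, here is a minimal quote-stripping variant (parsePageSafe is a hypothetical name, not from the course code):

import re

def parsePageSafe(ilt, html):
    # same regular expressions as above, but strip the quotes by hand instead of eval()
    plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
    tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
    for p, t in zip(plt, tlt):
        price = p.split(':')[1].strip('"')   # '"99.00"' -> '99.00'
        title = t.split(':')[1].strip('"')
        ilt.append([price, title])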
Using the scrapy library:
scrapy startproject name
scrapy genspider name domain
The spider file:
class …
def parse(self, response)…
Here response is the page response handed to parse(); response.xpath('…') returns a list of Selector objects, while response.xpath('…').extract() returns a list of plain strings. I was not fluent with xpath() at first and it cost me quite a bit of time; in the beginning I could not even tell what xpath() returns apart from what xpath().extract() returns. With shaky fundamentals, some pits simply have to be stepped in. The scrapy shell is a quick way to see the difference, as sketched below.
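A minimal sketch of checking this in the scrapy shell (the URL and the XPath here are only placeholders for whatever page is being inspected):

# run: scrapy shell 'http://www.szkj.gov.cn/'   (any page works; this URL is just an example)
sel = response.xpath('//a/@href')                     # SelectorList of Selector objects
urls = response.xpath('//a/@href').extract()          # list of plain strings, one per match
first = response.xpath('//a/@href').extract_first()   # the first string, or None if nothing matched
print(type(sel), type(urls), first)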
Since there are quite a few policy documents, the first step is to scrape the URLs of these documents (I expect to find a more efficient way as I keep learning; one possible way is sketched after this spider's code). Here is the code for scraping the URLs:
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy

class DmozSpider(scrapy.Spider):
    name = "extr"
    allowed_domains = ["szkj.gov.cn"]   # should match the site actually being crawled
    start_urls = [
        'http://www.szkj.gov.cn/news/20171227/tfpmwnpw-0jt7-zwta-wffv-5o1ar32e34.html',
    ]
    # def parse(self, response):
    #     filename = response.url.split('/')[-2]
    #     with open(filename, 'wb') as f:
    #         f.write(response.body)
    #
    # XPath notes taken while inspecting the list page:
    # /html/body/div[4]/div[1]/div/ul
    # /html/body/div[4]/div[1]/div/ul/li[1]
    # /html/body/div[4]/div[1]/div/ul/li[1]/div[1]
    # /html/body/div[4]/div[1]/div/ul/li[1]/div[1]/a
    # /html/body/div[4]/div[1]/div/ul/li[1]/div[2]   time

    def parse(self, response):
        i = 0
        # doc = {"url": [], "title": [], "time": []}
        doc = []
        file_object = open(r"E:\python\try\art\url.txt", "a+")
        # every <li> in the list holds one policy document entry
        for father in response.xpath('/html/body/div[4]/div[1]/div/ul/li'):
            if "2017" in father.xpath('./div[2]/text()').extract()[0]:
                entry = {}
                print("====================%d========!!!!!!!!!!!!=============================" % i)
                art_url = father.xpath('./div[1]/a/@href').extract()[0]
                entry['url'] = art_url
                print(art_url)
                file_object.write(art_url)
                file_object.write('\t')
                title = father.xpath('./div[1]/a/@title').extract()[0]
                entry['title'] = title
                print(title)
                file_object.write(title)
                file_object.write('\t')
                time = father.xpath('./div[2]/text()').extract()[0]
                entry['time'] = time
                print(time)
                file_object.write(time)
                file_object.write('\n')
                doc.append(entry)
                i = i + 1
        file_object.close()
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # print each URL in a form that can be pasted straight into another spider's start_urls
        for d in doc:
            # print(d["url"] + " " + d["title"] + " " + d["time"])
            print("'" + "http://www.szkj.gov.cn" + d["url"] + "',")
Next, the content of the policy documents is scraped through these links and written to my local disk (a tidier variant that yields items instead of writing the file by hand is sketched after the code). The code:
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy

class DmozSpider(scrapy.Spider):
    name = "art"
    allowed_domains = ["szkj.gov.cn"]   # should match the site actually being crawled
    start_urls = [
'http://www.szkj.gov.cn/news/20171227/tfpmwnpw-0jt7-zwta-wffv-5o1ar32e34.html',
'http://www.szkj.gov.cn/news/20171117/hjojwmv5-yrwm-jaxe-izic-x9b151x2iz.html',
'http://www.szkj.gov.cn/news/20171027/2rjrxx0w-nw76-ru14-b2zd-gegfavg5hn.html',
'http://www.szkj.gov.cn/news/20171018/o0hknmkk-rcll-pffs-k5vw-4b5w65u3sw.html',
'http://www.szkj.gov.cn/news/20170929/14ftvjse-90e4-7a2o-0s6w-gy3zm42v82.html',
'http://www.szkj.gov.cn/news/20170927/j0z3mctd-0zaz-uhd7-lw66-bzdt6owoaa.html',
'http://www.szkj.gov.cn/news/20170920/xe5v6qrk-40ip-i1ie-f19q-0afi5pbakf.html',
'http://www.szkj.gov.cn/news/20170825/ltehbi1t-atph-esk3-8nvo-qjvo3edxfg.html',
'http://www.szkj.gov.cn/news/20170722/zok9eaum-nbpm-fg7q-9dst-i8y1kfkbm3.html',
'http://www.szkj.gov.cn/news/20170721/ltqa0c66-lu96-fvh5-pu6h-7byyzvifnn.html',
'http://www.szkj.gov.cn/news/20170630/jhgxoro3-rwys-9hw7-dlhr-ssxizm6yg6.html',
'http://www.szkj.gov.cn/news/20170414/9uso4cya-9clp-khwc-72u6-sszwklb6fp.html',
'http://www.szkj.gov.cn/news/20170109/fac7j3kh-hzgz-yfwd-qjwd-pvxu3ubzsk.html'
    ]
    # def parse(self, response):
    #     filename = response.url.split('/')[-2]
    #     with open(filename, 'wb') as f:
    #         f.write(response.body)
    #
    # XPath notes taken while inspecting the article pages:
    # /html/body/div[4]/div[1]/div                     main
    # /html/body/div[4]/div[1]/div/h1                  title
    # /html/body/div[4]/div[1]/div/div[1]              time
    # /html/body/div[4]/div[1]/div/div[2]              content
    # /html/body/div[4]/div[1]/div/div[2]/div[1]
    # /html/body/div[4]/div[1]/div/div[2]/div[5]/text()
    # /html/body/div[4]/div[1]/div[1]/div/span/a
    file_path = r'E:\python\try\art\content.txt'

    def parse(self, response):
        i = 0
        file_object = open(r"E:\python\try\art\content.txt", "a+")
        title = response.xpath('/html/body/div[4]/div[1]/div/h1/text()').extract()[0]
        file_object.write(title + '\n')
        time = response.xpath('/html/body/div[4]/div[1]/div/div[1]/text()').extract()[0]
        file_object.write(time + '\n')
        # with open(r'E:\python\try\art\content.txt', 'w') as file_object:
        # each <div> under div[2] is one paragraph of the article body
        for rel in response.xpath('/html/body/div[4]/div[1]/div/div[2]/div'):
            print("=========================================%d===============================" % i)
            try:
                # string(.) flattens all the text inside the node; \xa0 is a non-breaking space
                son = rel.xpath('string(.)').extract()[0].replace(u'\xa0', u' ')
                print(type(son))
                print(len(son))
                file_object.write(son)
                print(son)
                i = i + 1
            except:
                continue
        file_object.write('\n\n\n\n\n\n\n\n\n\n')
        file_object.close()
#txtName = "codingWord.txt"
#f=file(txtName, "a+")
#for i in range(1,100):
# if i % 2 == 0:
# new_context = "C++" + '\n'
# f.write(new_context)
# else:
# new_context = "Python" + '\n'
# f.write(new_context)
#f.close()
# for tr in rel:
#
# str = tr.xpath('./td[1]/text()').extract()
# print(str[0])
# print(tr.xpath('./td[2]/div/@align').extract()[0])
# print(tr.xpath('./td[2]/div/text()').extract()[0])
# print(tr.xpath('./td[4]/text()').extract()[0])
# print "========================================================================"
# rel.xpath('/td[1]').extract()
# print "========================================================================"
# print response.xpath('/html/body/div[3]/div/div[2]/div/div[3]/div/table/tbody/tr[1]/td[1]').extract()
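Writing to content.txt with open() and manual separators works, but Scrapy's more idiomatic route is to yield items from parse() and let a feed export (or an item pipeline) do the writing. A minimal sketch under that assumption, meant as a drop-in replacement for the parse() above (the field names are just the ones used there):

# run the crawl with a feed export instead of hand-written file I/O, e.g.:
#   scrapy crawl art -o articles.json
def parse(self, response):
    paragraphs = []
    for div in response.xpath('/html/body/div[4]/div[1]/div/div[2]/div'):
        text = div.xpath('string(.)').extract_first()
        if text:
            paragraphs.append(text.replace(u'\xa0', u' '))
    # yielding a plain dict is enough; Scrapy serializes it into the output file
    yield {
        'url': response.url,
        'title': response.xpath('/html/body/div[4]/div[1]/div/h1/text()').extract_first(),
        'time': response.xpath('/html/body/div[4]/div[1]/div/div[1]/text()').extract_first(),
        'content': '\n'.join(paragraphs),
    }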
That is part of this week's results; I hope to dig into crawlers more deeply next week, though I am not sure how far the crawler part of the data mining course is actually supposed to go.
马神 next to me has long since started down the advanced path. I hope he types his code a little more slowly and waits for me……