# 两个实例仅供参考,不具有实用作用。因为网页都需要登录。
# 淘宝商品定向爬虫.py
#获取淘宝搜索页面信息,提取其中的商品名称和价格
import requests
import re
def getHtMLText(url):
    """Fetch *url* and return its body text, or "" on any request failure.

    The encoding is set from requests' content-based guess
    (apparent_encoding) so non-UTF-8 pages decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are
        # expected here; callers treat "" as "no data".
        return ""
def parsePage(ilt, htlm):
    """Extract [price, title] pairs from a search-result page into *ilt*.

    ilt: output list; each match is appended as a [price, title] list.
    htlm: raw HTML text of the result page.  Prices come from
    "realPrice":"..." fields, titles from /span>:"..." fields.
    """
    try:
        # Bug fix: the original pattern was '\:\[\d\.]*\"' -- the escaped
        # '[' was matched literally and the opening quote was missing, so
        # no price ever matched.
        plt = re.findall(r'\"realPrice\"\:\"[\d\.]*\"', htlm)
        tlt = re.findall(r'/span>\:\".*?\"', htlm)
        for i in range(len(plt)):
            # Split only on the first ':' so titles containing ':' are not
            # truncated, and strip the surrounding quotes instead of
            # eval()-ing untrusted page text.
            price = plt[i].split(':', 1)[1].strip('"')
            title = tlt[i].split(':', 1)[1].strip('"')
            # Bug fix: append(price, title) raised TypeError; append one
            # [price, title] record as printGoodsList expects.
            ilt.append([price, title])
    except Exception:
        print("")
def printGoodsList(ilt):
    """Print *ilt* as a table of row-number / price / title.

    ilt: list of [price, title] records as built by parsePage.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    count = 0
    for g in ilt:
        count = count + 1  # bug fix: counter was never incremented (all rows printed 0)
        print(tplt.format(count, g[0], g[1]))
def main(goods='牛仔裤', depth=2):
    """Crawl *depth* Taobao result pages for *goods* and print the table.

    goods: search keyword (default kept from the original script).
    depth: number of result pages to fetch; each page offset steps by 44,
        the number of items per Taobao result page.
    """
    start_url = 'https://ai.taobao.com/search/index.htm?key=' + goods
    infolist = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)
            html = getHtMLText(url)
            parsePage(infolist, html)
        except Exception:
            # Best effort: a failing page is skipped, the crawl continues.
            continue
    printGoodsList(infolist)


main()
# 股票数据定向爬虫.py
import requests
from bs4 import BeautifulSoup
import traceback
import re
def getHtMLText(url, code='utf-8'):
    """Fetch *url* and return its text decoded as *code*, or "" on failure.

    code: encoding assumed for the response body; passing it explicitly
        skips the slower apparent_encoding detection used in the other
        script.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except; callers treat "" as "no data".
        return ""
def getStockList(lst, stockURL):
    """Append stock codes like 'sh600000' / 'sz000001' found in the anchor
    hrefs of the listing page at *stockURL* to *lst*."""
    page = getHtMLText(stockURL)
    soup = BeautifulSoup(page, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            # First sh/sz + 6-digit code in the link, if any.
            lst.append(re.findall(r'[s][hz]\d{6}', href)[0])
        except:
            # Anchors without an href, or without a code, are skipped.
            continue
def getStockInfo(lst, stockURL, fpath):
    """For each stock code in *lst*, fetch its detail page and append a
    dict of its fields (as a str repr, one per line) to the file *fpath*.

    lst: stock codes such as 'sh600000' (e.g. from getStockList).
    stockURL: base URL; each page fetched is stockURL + code + '.html'.
    fpath: output text file, opened in append mode, UTF-8.
    Progress is printed in place on stdout as a percentage.
    """
    count = 0
    for stock in lst:
        url = stockURL + stock + '.html'
        html = getHtMLText(url)
        try:
            if html == "":
                # Fetch failed: skip silently.  NOTE(review): count is NOT
                # advanced here, so the progress percentage undercounts
                # skipped pages and may never reach 100%.
                continue
            infoDict = {
            }
            soup = BeautifulSoup(html, 'html.parser')
            # Container div holding all per-stock fields on the page.
            stockInfo = soup.find('div', attrs={
                'class': 'stock-bets'})
            # First 'bets-name' element holds the stock name; split() keeps
            # only the first whitespace-separated token of its text.
            name = stockInfo.find_all(attrs={
                'class': 'bets-name'})[0]
            infoDict.update({
                '股票名称': name.text.split()[0]})
            # <dt>/<dd> pairs carry field-name / field-value in page order;
            # presumably both lists have equal length -- verify on the page.
            KeyList = stockInfo.find_all('dt')
            ValueList = stockInfo.find_all('dd')
            for i in range(len(KeyList)):
                key = KeyList[i].text
                value = ValueList[i].text
                infoDict[key] = value
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                print("\r当前速度:{:.2f}%".format(count * 100 / len(lst)), end="")
        except:
            # Any parse failure (e.g. stockInfo is None) still advances the
            # progress counter, dumps the traceback for debugging, and
            # moves on to the next code.
            count = count + 1
            print("\r当前速度:{:.2f}%".format(count * 100 / len(lst)), end="")
            traceback.print_exc()
            continue
def main(output_file='D:/Python/Workspace/爬虫基础/stock.txt'):
    """Crawl stock codes from the East Money listing page, fetch each
    stock's detail page from Baidu Gupiao, and append the records to
    *output_file*.

    output_file: destination text file (default kept from the original
        script; pass your own path to avoid the hard-coded Windows one).
    """
    stock_list_url = 'http://quote.eastmoney.com/stock.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


main()
# PS: 学习链接 https://www.icourse163.org/learn/BIT-1001870001?tid=1206951268#/learn/announce