Python网络爬虫
标准爬取数据格式
import requests
def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns the string "产生异常" ("an exception occurred") on any
    request failure, matching this tutorial's convention.
    """
    try:
        r = requests.get(url, timeout=30)  # timeout so a dead host cannot hang forever
        r.raise_for_status()               # turn HTTP 4xx/5xx into exceptions
        r.encoding = r.apparent_encoding   # guess encoding from the body, not the header
        return r.text
    except requests.RequestException:      # narrowed from bare except: do not swallow KeyboardInterrupt etc.
        return "产生异常"
if __name__ == "__main__":
    # Demo: fetch the Baidu front page and dump its HTML.
    print(getHTMLText("http://www.baidu.com"))
一般爬取数据
import requests

# Basic fetch of a JD.com product page.
url = "http://item.jd.com/2967929.html"
try:
    r = requests.get(url, timeout=30)     # timeout added: avoid hanging forever
    r.raise_for_status()                  # raise on HTTP 4xx/5xx
    r.encoding = r.apparent_encoding      # decode using the detected charset
    print(r.text)                         # Python 3 print(), consistent with the first snippet
except requests.RequestException:         # narrowed from a bare except
    print("产生异常")
改变user-agent访问网站爬取信息
import requests

# Amazon blocks the default python-requests User-Agent; send a
# browser-like User-Agent header instead.
url = "http://www.amazon.cn/gp/product/B01M8L5Z3Y"
try:
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])              # print only a slice so the console is not flooded
except requests.RequestException:         # narrowed from a bare except; Python 3 print()
    print("爬取失败")
爬取百度搜索Python后的内容
import requests

# Baidu search for "Python" via the `wd` query-string parameter.
kv = {'wd': 'Python'}
try:
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(len(r.text))                    # size of the result page
    print(r.text)
except requests.RequestException:         # narrowed from a bare except; Python 3 print()
    print("产生异常")
从网上爬取并下载资源
import requests
import os

# Download an image and save it under `root`, named after the last
# path component of the URL.
root = "/home/sun/Python/pics/"
url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"
path = root + url.split('/')[-1]
try:
    if not os.path.exists(root):
        os.makedirs(root)                 # makedirs: also creates missing parent directories
    if not os.path.exists(path):
        r = requests.get(url, timeout=60)
        r.raise_for_status()              # fix: do not save an HTML error page as the image
        with open(path, 'wb') as f:       # `with` closes the file; redundant f.close() removed
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):  # narrowed from a bare except
    print("爬取失败")
查询IP归属地(ip138接口)。注意:该网站响应较慢且可能有反爬验证,运行结果不稳定,但代码逻辑本身应该没错
import requests

# Look up the geographic location of an IP address via ip138.com.
url = "https://m.ip138.com/iplookup.asp?ip="
try:
    r = requests.get(url + '202.204.80.112', timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[-400:])                  # the lookup result is near the end of the page
except requests.RequestException:         # was a bare except + Python 2 print statement
    print("爬取失败")
使用BeautifulSoup
import requests
from bs4 import BeautifulSoup   # moved to the top of the snippet (was mid-script)

# BeautifulSoup basics: parse a page and inspect tags and attributes.
r = requests.get("http://www.baidu.com")
r.encoding = r.apparent_encoding
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.title)               # the whole <title> tag
print(soup.title.name)          # the tag's name: "title"
print(soup.a)                   # first <a> tag in the document
tag = soup.a
print(tag.attrs)                # all attributes of the tag as a dict
print(tag.attrs['href'])        # one attribute's value
print(tag)
# print(soup.prettify())
爬取中国大学排名并格式化输出
import requests
import bs4
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return its text, or "" on any request failure.

    Returning "" (rather than an error message) lets callers such as
    fillUnivList degrade gracefully on an empty document.
    """
    try:
        r = requests.get(url, timeout=30)  # timeout so a dead host cannot hang forever
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:      # narrowed from a bare except
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking page *html* and append [rank, name, score] rows to *ulist*.

    Does nothing when the page has no <tbody> (e.g. when getHTMLText
    returned "" after a failed request) — previously this raised
    AttributeError on None.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:  # empty/failed download: leave ulist unchanged
        return
    for tr in tbody.children:
        # .children yields NavigableString whitespace nodes too; keep only real tags.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            # columns: 0 = rank, 1 = university name, 3 = total score
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
def printUnivList(ulist, num):
    """Print a tab-separated header followed by the first *num* ranking rows."""
    header = "{0} \t {1} \t\t {2}".format("排名", "学校", "总分")
    print(header)
    for idx in range(num):
        row = ulist[idx]
        print("%s \t %s \t %s " % (row[0], row[1], row[2]))
def main():
    """Fetch the 2016 ranking page, parse it, and print the top entries."""
    uinfo = []
    url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 5)  # number of leading rows to display


if __name__ == "__main__":  # guard added, consistent with the file's first snippet
    main()
后续学习中。。。