Python爬虫

Python网络爬虫

通用爬取代码框架

 import requests
 def getHTMLText(url):
     """Fetch ``url`` with an HTTP GET and return the response body as text.

     Args:
         url: Address of the page to download.

     Returns:
         The decoded page text, or the literal string "产生异常" when the
         request fails (connection error, timeout, or an HTTP error status).
     """
     try:
         # A timeout keeps an unresponsive server from blocking forever.
         r = requests.get(url, timeout=30)
         # Turn 4xx/5xx responses into an exception instead of returning
         # the error page as if it were the requested content.
         r.raise_for_status()
         # Decode using the encoding detected from the body rather than
         # the one declared in the response headers.
         r.encoding = r.apparent_encoding
         return r.text
     except requests.RequestException:
         return "产生异常"
 # Demo: runs only when this file is executed as a script.
 if __name__ == "__main__":
     page = getHTMLText("http://www.baidu.com")
     print(page)

一般爬取数据

 import requests
 # Download one JD product page and print its HTML.
 url = "http://item.jd.com/2967929.html"
 try:
     # A timeout keeps an unresponsive server from blocking forever.
     r = requests.get(url, timeout=30)
     # Raise on 4xx/5xx so an error page is not printed as the result.
     r.raise_for_status()
 except requests.RequestException:
     print("产生异常")
 else:
     # Decode using the encoding detected from the body.
     r.encoding = r.apparent_encoding
     print(r.text)

修改 User-Agent 请求头后访问网站并爬取信息

import requests
# Download an Amazon product page while sending a browser-like User-Agent
# header, then print a 1000-character slice of the HTML.
url = "http://www.amazon.cn/gp/product/B01M8L5Z3Y"
kv = {'user-agent': 'Mozilla/5.0'}
try:
    # A timeout keeps an unresponsive server from blocking forever.
    r = requests.get(url, headers=kv, timeout=30)
    # Raise on 4xx/5xx so an error page is not printed as the result.
    r.raise_for_status()
except requests.RequestException:
    print("爬取失败")
else:
    # Decode using the encoding detected from the body.
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])

爬取百度搜索Python后的内容

import requests
# Query Baidu for "Python" and print the size and text of the result page.
# The dict is sent as the URL query string (wd=Python).
kv = {'wd': 'Python'}
try:
    # A timeout keeps an unresponsive server from blocking forever.
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=30)
    # Raise on 4xx/5xx so an error page is not printed as the result.
    r.raise_for_status()
except requests.RequestException:
    print("产生异常")
else:
    # Decode using the encoding detected from the body.
    r.encoding = r.apparent_encoding
    print(len(r.text))
    print(r.text)

从网上爬取并下载资源

import requests
import os
# Download one image into a local directory, skipping the download when a
# file with the same name is already there.
root = "/home/sun/Python/pics/"
url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"
# The local file is named after the last segment of the URL path.
path = os.path.join(root, url.split('/')[-1])
try:
    if not os.path.isdir(root):
        # makedirs also creates missing parent directories.
        os.makedirs(root)
    if os.path.exists(path):
        print("文件已存在")
    else:
        r = requests.get(url, timeout=30)
        # Check the status before writing so an HTTP error page is never
        # saved under the image's file name.
        r.raise_for_status()
        # The with-block closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
except (requests.RequestException, IOError, OSError):
    # Network failures and filesystem failures share one message.
    print("爬取失败")

查询 IP 归属地（程序运行情况比较复杂，但代码本身应该没有问题）

 import requests
 # Look up a fixed IP address on ip138 and print the last 400 characters of
 # the result page.
 url = "https://m.ip138.com/iplookup.asp?ip="
 try:
     # A timeout keeps an unresponsive server from blocking forever.
     r = requests.get(url + '202.204.80.112', timeout=30)
     # Raise on 4xx/5xx so an error page is not printed as the result.
     r.raise_for_status()
 except requests.RequestException:
     print("爬取失败")
 else:
     # Decode using the encoding detected from the body.
     r.encoding = r.apparent_encoding
     print(r.text[-400:])

使用BeautifulSoup

 import requests
 from bs4 import BeautifulSoup

 # Fetch the Baidu home page and demonstrate basic BeautifulSoup tag access.
 r = requests.get("http://www.baidu.com", timeout=30)
 # Stop here on an HTTP error instead of parsing an error page.
 r.raise_for_status()
 r.encoding = r.apparent_encoding
 demo = r.text
 soup = BeautifulSoup(demo, "html.parser")

 # soup.<name> yields the first tag with that name, or None when the page
 # has no such tag, so guard before reading attributes of the result.
 title = soup.title
 print(title)
 if title is not None:
     print(title.name)

 tag = soup.a
 print(tag)
 if tag is not None:
     print(tag.attrs)
     # .get() yields None instead of raising when the link has no href.
     print(tag.get('href'))
 print(tag)
 # To inspect the whole parsed document, print soup.prettify().

爬取中国大学排名并格式化输出

 import requests
 import bs4
 from bs4 import BeautifulSoup
 def getHTMLText(url):
     """Fetch ``url`` with an HTTP GET and return the response body as text.

     Args:
         url: Address of the page to download.

     Returns:
         The decoded page text, or an empty string when the request fails
         (connection error, timeout, or an HTTP error status).
     """
     try:
         # A timeout keeps an unresponsive server from blocking forever.
         r = requests.get(url, timeout=30)
         # Turn 4xx/5xx responses into an exception instead of returning
         # the error page as if it were the requested content.
         r.raise_for_status()
         # Decode using the encoding detected from the body rather than
         # the one declared in the response headers.
         r.encoding = r.apparent_encoding
         return r.text
     except requests.RequestException:
         return ""

 def fillUnivList(ulist, html):
     """Extract table rows from ``html`` and append them to ``ulist``.

     For every row tag inside the first ``<tbody>``, the text of cells 0, 1
     and 3 is appended as a three-item list.

     Args:
         ulist: List that receives one ``[cell0, cell1, cell3]`` entry per
             row; it is modified in place.
         html: HTML text of the page to parse.
     """
     soup = BeautifulSoup(html, "html.parser")
     tbody = soup.find('tbody')
     if tbody is None:
         # No table body (for example an empty page): nothing to add.
         return
     for tr in tbody.children:
         # Skip children that are not tags, such as text nodes between rows.
         if not isinstance(tr, bs4.element.Tag):
             continue
         # Calling a tag is shorthand for find_all on it.
         tds = tr('td')
         if len(tds) < 4:
             # Row is too short to hold the cells read below.
             continue
         ulist.append([tds[0].string, tds[1].string, tds[3].string])

 def printUnivList(ulist, num):
     """Print a header line followed by the first ``num`` entries of ``ulist``.

     Args:
         ulist: Sequence of entries; items 0, 1 and 2 of each entry are
             printed under the rank, school and total-score headings.
         num: Maximum number of entries to print. When ``ulist`` holds
             fewer entries, only the available ones are printed.
     """
     print("{0} \t {1} \t\t {2}".format("排名", "学校", "总分"))
     # Slicing never reads past the end of the list; max() keeps a negative
     # count printing nothing rather than slicing from the end.
     for u in ulist[:max(num, 0)]:
         print("%s \t %s \t %s " % (u[0], u[1], u[2]))

 def main(url="http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html", num=5):
     """Download the ranking page, parse it and print the leading rows.

     Args:
         url: Page to download; defaults to the 2016 ranking page.
         num: How many leading rows to print.
     """
     uinfo = []
     html = getHTMLText(url)
     if not html:
         # getHTMLText returns an empty string when the download fails.
         print("爬取失败")
         return
     fillUnivList(uinfo, html)
     printUnivList(uinfo, num)


 main()

后续学习中……

你可能感兴趣的:(Python,python)