Case Collection

Case 1: Scraping a JD.com product page

import requests
url = 'https://item.jd.com/2967929.html'
try:
  r = requests.get(url)
  r.raise_for_status()  # raise an exception for non-200 status codes
  r.encoding = r.apparent_encoding
  print(r.text[:1000])
except:
  print('Request failed')

Case 2: Scraping an Amazon product page

Amazon blocks requests that identify themselves with the default Python User-Agent, so replace the headers with a browser User-Agent string.

import requests
url = 'https://www.amazon.cn/gp/product/B01M8L5Z3Y'
try:
  kv = {'user-agent': 'Mozilla/5.0'}  # pretend to be a browser
  r = requests.get(url, headers=kv)
  r.raise_for_status()
  r.encoding = r.apparent_encoding
  print(r.text[:1000])
except:
  print('Request failed')

Case 3: Submitting keyword searches to Baidu and 360

Baidu's keyword interface:
http://www.baidu.com/s?wd=keyword
360's keyword interface:
http://www.so.com/s?q=keyword

import requests
keyword = 'Python'
try:
  kv = {'wd': keyword}
  r = requests.get('http://www.baidu.com/s', params=kv)
  print(r.request.url)  # show the URL that requests actually built
  r.raise_for_status()
  r.encoding = r.apparent_encoding
  print(r.text[:1000])
except:
  print('Request failed')
import requests
keyword = 'Python'
try:
  kv = {'q': keyword}
  r = requests.get('http://www.so.com/s', params=kv)
  print(r.request.url)
  r.raise_for_status()
  r.encoding = r.apparent_encoding
  print(r.text[:1000])
except:
  print('Request failed')

Case 4: Downloading and saving an image

import requests
import os
url = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
root = 'D://pics//'
path = root + url.split('/')[-1]
try:
  if not os.path.exists(root):
    os.mkdir(root)
  if not os.path.exists(path):
    r = requests.get(url)
    with open(path, 'wb') as f:
      f.write(r.content)  # write the binary payload, i.e. the image data
    print('file saved')
  else:
    print('file already exists')
except:
  print('Failure')
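
For larger files it can be preferable not to hold the whole payload in memory. The sketch below is a minimal variant of the same download that streams the image to disk in chunks using requests' stream mode; the 8 KB chunk size is an arbitrary choice.

import requests
import os
url = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
root = 'D://pics//'
path = root + url.split('/')[-1]
try:
  if not os.path.exists(root):
    os.mkdir(root)
  r = requests.get(url, stream=True, timeout=30)
  r.raise_for_status()
  with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=8192):  # write the image 8 KB at a time
      f.write(chunk)
  print('file saved')
except:
  print('Failure')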

Case 5: IP address lookup

import requests
url = 'http://m.ip138.com/ip.asp?ip='
try:
  r = requests.get(url+'202.204.80.112')
  r.raise_for_status()
  r.encoding = r.apparent_encoding
  print(r.text[-500:])
except:
  print('Failed')

Case 6: University rankings

Input: the URL of the university-ranking page
Output: the ranking information printed to the screen (rank, university name, total score)
Toolchain: Requests, BeautifulSoup
Step 1: fetch the ranking page from the web
Step 2: extract the information on the page into a suitable data structure
Step 3: use that data structure to display and output the result

import requests
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
  try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text
  except:
    return ''

def fillUnivList(ulist, html):
  soup = BeautifulSoup(html, 'html.parser')
  for tr in soup.find('tbody').children:
    if isinstance(tr, bs4.element.Tag):  # skip NavigableString children
      tds = tr('td')                     # same as tr.find_all('td')
      ulist.append([tds[0].string, tds[1].string, tds[2].string])

def printUnivList(ulist, num):
  tplt = '{:^10}\t{:^6}\t{:^10}'
  print(tplt.format('Ranking', 'School Name', 'Marks'))
  for i in range(num):
    u = ulist[i]
    print(tplt.format(u[0], u[1], u[2]))

def main():
  uinfo = []
  url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
  html = getHTMLText(url)
  fillUnivList(uinfo, html)
  printUnivList(uinfo, 20)  # show the top 20 universities

main()

Case 7: Scraping Taobao search results

Goal: fetch Taobao search result pages and extract the product names and prices.
Key points: the Taobao search interface and how to page through results (the s parameter advances by 44, one page of results, at a time).

import re
import requests

def getHTMLText(url):
  try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text
  except:
    return 'Error'

def parsePage(ilt, html):
  try:
    plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)  # price fields like "view_price":"99.00"
    tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
    for i in range(len(plt)):
      price = eval(plt[i].split(':')[1])
      title = eval(tlt[i].split(':')[1])
      ilt.append([price, title])
  except:
    print('Errors')


def printGoodsList(ilt):
  tplt = "{:4}\t{:8}\t{:16}"
  print(tplt.format('NO', 'PRICE', 'ITEM NAME'))
  count = 0
  for g in ilt:
    count = count + 1
    print(tplt.format(count, g[0], g[1]))

def main():
  goods = 'bags'
  depth = 2                                # number of result pages to crawl
  start_url = 'https://s.taobao.com/search?q=' + goods
  infoList = []
  for i in range(depth):
    try:
      url = start_url + '&s=' + str(44*i)  # each page starts 44 items further on
      html = getHTMLText(url)
      parsePage(infoList, html)
    except:
      continue
  printGoodsList(infoList)

main()

Case 8: Targeted stock-data crawler

Goal: obtain the names and trading information of all stocks listed on the Shanghai and Shenzhen exchanges
Output: saved to a file
Steps:
Step 1: get the stock list from Eastmoney (东方财富网)
Step 2: for each stock in the list, fetch its details from Baidu Gupiao (百度股票)
Step 3: save the results to a file

import requests
import traceback
import re
from bs4 import BeautifulSoup

def getHTMLText(url):
  try:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text
  except:
    return ''

def getStockList(lst, stockURL):
  html = getHTMLText(stockURL)
  soup = BeautifulSoup(html, 'html.parser')
  a = soup.find_all('a')
  for i in a:
    try:
      href = i.attrs['href']
      lst.append(re.findall(r'[s][hz]\d{6}', href)[0])  # stock codes look like sh600000 / sz000001
    except:
      continue

def getStockInfo(lst, stockURL, fpath):
  for stock in lst:
    url = stockURL + stock + '.html'
    html = getHTMLText(url)
    try:
      if html == '':
        continue
      infoDict = {}
      soup = BeautifulSoup(html, 'html.parser')
      stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
      name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
      infoDict.update({'StockName': name.text.split()[0]})
      keyList = stockInfo.find_all('dt')
      valueList = stockInfo.find_all('dd')
      for i in range(len(keyList)):
        key = keyList[i].text
        val = valueList[i].text
        infoDict[key] = val
      with open(fpath, 'a', encoding='utf-8') as f:
        f.write(str(infoDict) + '\n')
    except:
      traceback.print_exc()
      continue

def main():
  stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
  stock_info_url = 'https://gupiao.baidu.com/stock/'
  output_file = 'D://BaiduStockInfo.txt'
  slist = []
  getStockList(slist, stock_list_url)
  getStockInfo(slist, stock_info_url, output_file)

main()

Case 9: Targeted stock-data crawler with Scrapy

import scrapy
import re

class StockSpider(scrapy.Spider):
  name = 'stocks'
  start_urls = ['http://quote.eastmoney.com/stocklist.html']

  def parse(self, response):
    # follow every link whose href looks like a stock code (sh/sz + 6 digits)
    for href in response.css('a::attr(href)').extract():
      try:
        stock = re.findall(r'[s][hz]\d{6}', href)[0]
        url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
        yield scrapy.Request(url, callback=self.parse_stock)
      except:
        continue

  def parse_stock(self, response):
    infoDict = {}
    stockInfo = response.css('.stock-bets')
    name = stockInfo.css('.bets-name').extract()[0]
    keyList = stockInfo.css('dt').extract()
    valueList = stockInfo.css('dd').extract()
    for i in range(len(keyList)):
      key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]  # strip the '>' and '</dt>' markup
      try:
        val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
      except:
        val = '--'
      infoDict[key] = val

    infoDict.update(
      {'stockName': re.findall(r'\s.*\(', name)[0].split()[0] +
                    re.findall(r'\>.*\<', name)[0][1:-1]})
    yield infoDict

# pipelines.py: process the items yielded by the spider
class BaidustocksPipeline(object):
  def process_item(self, item, spider):
    return item

class BaidustocksInfoPipeline(object):
    def open_spider(self, spider):
        self.f = open('BaiduStockInfo.txt', 'w')
    
    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except:
            pass
        return item

# settings.py: enable the pipeline
ITEM_PIPELINES = {
    'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
}
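
With the spider and the pipeline in place, the crawler is started from the project directory with the standard Scrapy command, assuming the project is named BaiduStocks as the pipeline path above implies:

scrapy crawl stocks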
