Python Web Crawling and Information Extraction (4): Web Crawlers in Practice

Targeted Crawler for Taobao Product Price Comparison


import requests
import re


def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

# The key part of the whole program:
# regular expressions are used here instead of BeautifulSoup


def parsePage(ilt, html):
    try:
        # prices and titles are embedded in the page source as JSON-like key/value pairs
        plt = re.findall(r'"view_price":"[\d.]*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the outermost quotes from the string
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("")


def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format('No.', 'Price', 'Product Name'))
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))


def main():
    goods = '书包'  # search keyword ("backpack")
    depth = 2  # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # each result page holds 44 items
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:  # if one page fails to parse, continue with the next page instead of aborting the whole run
            continue
    printGoodsList(infoList)


main()
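Note that Taobao's search pages now redirect anonymous requests to a login page, so getHTMLText as written may only retrieve the login HTML. A minimal workaround sketch is to send the headers of a logged-in browser session; the cookie value below is a placeholder you must copy from your own browser:

def getHTMLText(url):
    # same as above, but sends a logged-in session's headers with every request
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'Cookie': 'paste_your_logged_in_cookie_string_here',  # placeholder, supply your own
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""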


Targeted Crawler for Stock Data


import requests
import re
import traceback
from bs4 import BeautifulSoup
import bs4


def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""


fpath = 'D:/gupiao.txt'
html = getHTMLText('https://hq.gucheng.com/gpdmylb.html')
soup = BeautifulSoup(html, 'html.parser')
a = soup.find_all('a')
lst = []
for i in a:
    try:
        href = i.attrs['href']
        lst.append(re.findall(r"[S][HZ]\d{6}", href)[0])
    except:
        continue
lst = [item.lower() for item in lst]  # convert the scraped stock codes to lowercase
count = 0
for stock in lst:
    url = 'https://gupiao.baidu.com/stock/' + stock + ".html"
    html = getHTMLText(url)
    try:
        if html == "":
            continue
        infoDict = {}
        soup = BeautifulSoup(html, 'html.parser')
        stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

        if isinstance(stockInfo, bs4.element.Tag):  # make sure the tag was actually found
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})  # key means "stock name"
            keylist = stockInfo.find_all('dt')
            valuelist = stockInfo.find_all('dd')
            for i in range(len(keylist)):
                key = keylist[i].text
                val = valuelist[i].text
                infoDict[key] = val

        with open(fpath, 'a', encoding='utf-8') as f:
            f.write(str(infoDict) + '\n')
            count = count + 1
            print("\rProgress: {:.2f}%".format(count * 100 / len(lst)), end="")
    except:
        count = count + 1
        print("\rProgress: {:.2f}%".format(count * 100 / len(lst)), end="")
        traceback.print_exc()
        continue
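The same crawler, refactored into the modular structure used throughout the course (getHTMLText / getStockList / getStockInfo / main):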
import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHTMLText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        # r.apparent_encoding analyzes the text of the fetched HTML page and lets the
        # program infer which encoding the content actually uses
        # r.encoding merely parses the HTML headers for whatever encoding they declare
        return r.text
    except:
        return ""

# Get the list of stock codes


def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
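The pattern r"[s][hz]\d{6}" matches an 's' followed by 'h' or 'z' and six digits, i.e. Shanghai/Shenzhen stock codes embedded in the hrefs. For example (the URL is illustrative):

re.findall(r"[s][hz]\d{6}", "https://quote.eastmoney.com/sh600000.html")
# -> ['sh600000']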

# Get the information of each individual stock
# Arguments: the list holding all stock codes, the base URL of the stock-info site,
# and the file path where the results will be saved


def getStockInfo(lst, stockURL, fpath):
    count = 0  # progress counter
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":  # skip empty pages
                continue
            infoDict = {}  # holds all the information about the current stock returned from the page
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})  # key means "stock name"

            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val  # a new entry can be added to a dict directly via dict[key] = value

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
                # \r moves the cursor back to the start of the line, so the next print
                # overwrites the previous one and the progress stays on a single line
        except:
            count = count + 1
            traceback.print_exc()
            print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            continue


def main():
    stock_list_url = 'https://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []  # holds the stock codes
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


main()

A technical route like requests + BeautifulSoup will never be dramatically fast, no matter how you tune it.
If you have demanding speed requirements, use the Scrapy framework instead.
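For comparison, a minimal Scrapy spider covering the same list-then-detail flow might look like the sketch below (the spider name and CSS selectors are illustrative assumptions, not a drop-in replacement for the code above):

import re
import scrapy


class StockSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = ['https://quote.eastmoney.com/stocklist.html']

    def parse(self, response):
        # follow every link that looks like an individual stock page
        for href in response.css('a::attr(href)').getall():
            if re.search(r"[s][hz]\d{6}", href):
                yield response.follow(href, callback=self.parse_stock)

    def parse_stock(self, response):
        # pair up the dt/dd elements into one item per page; Scrapy's engine
        # downloads pages concurrently, which is where the speedup comes from
        keys = response.css('dt::text').getall()
        vals = response.css('dd::text').getall()
        yield dict(zip(keys, vals))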
