Focused Crawler: Taobao Product Price Comparison
import requests
import re

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

# The heart of the program: the page is parsed with regular expressions
# rather than with BeautifulSoup
def parsePage(ilt, html):
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval strips the outermost quotes from the string
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("")

def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format('序号', '价格', '商品名称'))  # columns: index, price, product title
    count = 0
    for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))

def main():
    goods = '书包'  # search keyword: "schoolbag"
    depth = 2       # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # each result page holds 44 items
            html = getHTMLText(url)
            parsePage(infoList, html)
        except:
            # if one page fails to parse, move on to the next page
            # instead of aborting the whole run
            continue
    printGoodsList(infoList)

main()
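To see what the eval trick in parsePage actually does, here is a minimal sketch; the html string below is hypothetical sample data in the same "key":"value" shape the search page embeds:

import re

html = '"view_price":"128.00","raw_title":"双肩包男士书包"'  # hypothetical sample data

plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
print(plt[0])                      # "view_price":"128.00"
print(plt[0].split(':')[1])        # "128.00"  -- the quotes are still part of the string
print(eval(plt[0].split(':')[1]))  # 128.00    -- eval strips the outermost quotes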
Focused Crawler: Stock Data
import requests
import re
import traceback
from bs4 import BeautifulSoup
import bs4

def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

fpath = 'D://gupiao.txt'
html = getHTMLText('https://hq.gucheng.com/gpdmylb.html')
soup = BeautifulSoup(html, 'html.parser')
a = soup.find_all('a')
lst = []
for i in a:
    try:
        href = i.attrs['href']
        lst.append(re.findall(r"[S][HZ]\d{6}", href)[0])
    except:
        continue
lst = [item.lower() for item in lst]  # convert the scraped stock codes to lowercase
count = 0
for stock in lst:
    url = 'https://gupiao.baidu.com/stock/' + stock + ".html"
    html = getHTMLText(url)
    try:
        if html == "":
            continue
        infoDict = {}
        soup = BeautifulSoup(html, 'html.parser')
        stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
        if isinstance(stockInfo, bs4.element.Tag):  # type check: find() may return None
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            keylist = stockInfo.find_all('dt')
            valuelist = stockInfo.find_all('dd')
            for i in range(len(keylist)):
                key = keylist[i].text
                val = valuelist[i].text
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        count = count + 1
        print("\r当前进度:{:.2f}%".format(count * 100 / len(lst)), end="")
    except:
        count = count + 1
        print("\r当前进度:{:.2f}%".format(count * 100 / len(lst)), end="")
        traceback.print_exc()
        continue
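The pattern [S][HZ]\d{6} matches a Shanghai or Shenzhen stock code: the literal letter S, then H or Z, then exactly six digits. A quick check against made-up hrefs of the shape the list page uses:

import re

# made-up example hrefs; only the stock-page links match
hrefs = ['https://hq.gucheng.com/SH600000/',
         'https://hq.gucheng.com/SZ000002/',
         'https://hq.gucheng.com/about/']
for href in hrefs:
    print(re.findall(r"[S][HZ]\d{6}", href))  # ['SH600000'], ['SZ000002'], []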
A refactored version of the same crawler, with the steps split into functions:

import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        # r.apparent_encoding analyzes the text of the fetched HTML page and lets the
        # program infer which encoding the content is likely using;
        # r.encoding is only parsed from the HTTP header and may be wrong
        return r.text
    except:
        return ""

# Fetch the list of stock codes
def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue

# Fetch the details of each individual stock
# lst: list holding all stock codes; stockURL: site serving the per-stock pages;
# fpath: path of the file the results are written to
def getStockInfo(lst, stockURL, fpath):
    count = 0  # progress counter (must be initialized before the loop)
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":  # skip empty pages
                continue
            infoDict = {}  # holds every field parsed from the current page
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val  # a dict gains a new entry directly via d[key] = value
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            # \r moves the cursor back to the start of the line, so each print
            # overwrites the previous one, giving an in-place progress display
            print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end='')
        except:
            count = count + 1
            traceback.print_exc()
            print('\r当前进度:{:.2f}%'.format(count * 100 / len(lst)), end='')
            continue

def main():
    stock_list_url = 'https://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []  # holds the stock codes
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
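The \r progress trick is easy to test on its own. A minimal standalone demo, where the sleep stands in for the real per-stock work:

import time

total = 50
for count in range(1, total + 1):
    # \r returns the cursor to the start of the line; end='' suppresses the newline,
    # so each iteration overwrites the previous percentage
    print('\r当前进度:{:.2f}%'.format(count * 100 / total), end='')
    time.sleep(0.02)  # stand-in for fetching and parsing one stock page
print()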
A technical route built on the requests and BeautifulSoup libraries will never be especially fast, because pages are fetched one at a time.
If speed is a hard requirement, use the Scrapy library instead.
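As a rough illustration only, here is what the stock-list step might look like as a Scrapy spider; the class name and output file are made up, and the speedup comes from Scrapy's asynchronous request scheduling rather than from any change in the parsing logic:

import re
import scrapy

class StockListSpider(scrapy.Spider):
    # illustrative sketch; run with: scrapy runspider stock_spider.py -o stocks.json
    name = 'stock_list'
    start_urls = ['https://hq.gucheng.com/gpdmylb.html']

    def parse(self, response):
        # same regex as above; Scrapy issues its requests asynchronously,
        # which is where it outpaces a sequential requests + BeautifulSoup loop
        for href in response.css('a::attr(href)').getall():
            for code in re.findall(r"[S][HZ]\d{6}", href):
                yield {'code': code.lower()}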