#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl Google Play category pages and print basic metadata for each app.

For every category in CATEGORIES the listing page is fetched, the app detail
links are extracted with regular expressions, and for each app the title,
developer, version, publication date, file size, supported OS and description
are printed.  Cover images are saved under IMAGE_DIR using a running counter
(``num``) as the file name.

This is a Python 2 script: it imports ``urllib2``, ``sgmllib`` and the
BeautifulSoup 3 package.

NOTE(review): the INSERT into the ``address`` table was commented out in the
original script, so the MySQL connection is opened (and the script exits if
that fails) but nothing is written to the database.
"""
import os
import random
import re
import sys
import urllib
import urllib2
from sgmllib import SGMLParser

import MySQLdb
import requests
from BeautifulSoup import BeautifulSoup

# Number of cover images saved so far; also the file name of the next image.
num = 0

BASE_URL = "https://play.google.com"
CATEGORY_URL = BASE_URL + "/store/apps/category/"
IMAGE_DIR = "googlemarket"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled socket blocks forever
IMAGE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
}

CATEGORIES = [
    'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
    'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
    'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
    'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
    'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
    'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
]

# Patterns are applied to the prettified page text; compiled once, not per page.
APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
TITLE_RE = re.compile(
    r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
PUBLISHED_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
FILE_SIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
OS_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
DESCRIPTION_RE = re.compile(
    r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
COVER_RE = re.compile(
    r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')
# Matches whole <br>, <br/>, <br />, <p> and </p> tags.  The original used the
# character class '[<br /> <p> </p>]', which blanked every 'b', 'r' and 'p'.
MARKUP_RE = re.compile(r'<br\s*/?>|</?p>')

# Index of the line holding the developer name inside the first DEVELOPER_RE
# match (taken from the original script; depends on the prettified layout).
DEVELOPER_LINE = 8


def _fetch_page(url):
    """Return the prettified UTF-8 HTML of *url*, or None if the request fails."""
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # One unreachable page must not abort the whole crawl.
        print(e)
        return None
    return BeautifulSoup(response.content).prettify("utf-8")


def _first(matches, default=""):
    """Return the first element of *matches*, or *default* when it is empty."""
    return matches[0] if matches else default


def _developer(page):
    """Return the developer line from *page*, or "" when it cannot be found."""
    lines = _first(DEVELOPER_RE.findall(page)).split("\n")
    return lines[DEVELOPER_LINE] if len(lines) > DEVELOPER_LINE else ""


def _strip_markup(text):
    """Replace <br> and <p> tags in *text* with a single space."""
    return MARKUP_RE.sub(' ', text)


def _save_cover(raw_src):
    """Download one cover image and store it as IMAGE_DIR/<num>.

    *raw_src* is the ``src`` attribute value as captured by COVER_RE, i.e.
    still wrapped in its quotes.  ``num`` is incremented only after the image
    has been written successfully.
    """
    global num
    # The original sliced with [1:-2], which removed the opening quote but TWO
    # trailing characters, cutting the last character off the URL.
    image_url = raw_src.strip().strip('"\'')
    print(image_url)
    try:
        response = requests.get(
            image_url, headers=IMAGE_HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(e)
        return
    if not os.path.isdir(IMAGE_DIR):
        os.makedirs(IMAGE_DIR)
    # Binary mode: the payload is image data, not text.
    with open(os.path.join(IMAGE_DIR, str(num)), "wb") as image_file:
        image_file.write(response.content)
    num += 1
    print(num)


def _crawl_app(url):
    """Fetch one app detail page, print its metadata and save its cover art."""
    print(url)
    page = _fetch_page(url)
    if page is None:
        return
    # App name
    for title in TITLE_RE.findall(page):
        print(title)
    # Developer
    print(_developer(page))
    # Version, last update date, file size, supported OS version.
    # A missing field prints an empty line instead of raising IndexError.
    for pattern in (VERSION_RE, PUBLISHED_RE, FILE_SIZE_RE, OS_RE):
        print(_first(pattern.findall(page)))
    # Description
    for description in DESCRIPTION_RE.findall(page):
        print(_strip_markup(description))
    # Cover art, downloaded once per app page.
    for raw_src in COVER_RE.findall(page):
        _save_cover(raw_src)


def main():
    """Crawl every category in CATEGORIES.

    Exits with status 1 if the MySQL connection cannot be established.
    Pages or images that fail to download are reported and skipped.
    """
    try:
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration or environment variables.
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='googlemarket', charset="utf8")
        conn.query("set names utf8")
    except MySQLdb.Error as e:
        print(e)
        sys.exit(1)

    try:
        for category in CATEGORIES:
            listing = _fetch_page(CATEGORY_URL + category)
            if listing is None:
                continue
            # De-duplicate links; sorting makes the crawl order reproducible.
            for href in sorted(set(APP_LINK_RE.findall(listing))):
                _crawl_app(BASE_URL + href)
    finally:
        conn.close()


if __name__ == "__main__":
    main()
<type 'str'>
Traceback (most recent call last):
File "crawler0729.py", line 103, in <module>
main()
File "crawler0729.py", line 91, in main
temp=requests.get(j[1:-2], headers=headers)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get
return request('get', url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send
raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)