#!/usr/bin/env python
#encoding=utf-8
import select,socket,codecs,doctest,time,datetime,os
def read_urls(path="./book/1.csv", max_lines=100):
    """Build product-page URLs from the ids in the first column of a CSV file.

    The first line of the file is treated as a header and skipped, and lines
    that contain no comma are ignored.  Only lines whose zero-based index is
    below *max_lines* are examined, so the default of 100 yields at most 99
    URLs.

    Args:
        path: Location of the UTF-8 encoded CSV file.
        max_lines: Number of leading lines (header included) to examine, or
            ``None`` to read the whole file.

    Returns:
        A list of ``http://book.360buy.com/<id>.html`` strings, in file order.
    """
    prefix = "http://book.360buy.com/%s.html"
    urls = []
    # The context manager closes the handle, and iterating it lazily avoids
    # loading the whole file when only its first lines are used.
    with codecs.open(path, "r", "utf-8") as handle:
        for idx, line in enumerate(handle):
            if idx == 0:
                continue
            if max_lines is not None and idx >= max_lines:
                break
            wid, comma, _ = line.partition(",")
            if not comma:
                continue
            urls.append(prefix % wid)
    return urls
def _parser(url):
"""
>>> _parser("http://book.360buy.com/123.html")
('book.360buy.com','/123.html')
"""
a,b=url[7:].split("/",1)
return (a,"/"+b)
def fetch(url):
    """Connect to the URL's host and send an HTTP/1.0 GET request for its path.

    The connection is made to port 80 and the request carries a fixed set of
    browser-like headers.  The response is NOT read here.

    Args:
        url: The URL to request; it is split into host and path by _parser().

    Returns:
        The connected socket with the request already sent.  The caller is
        responsible for reading the response and closing the socket.

    Raises:
        socket.error: If connecting or sending fails; the socket is closed
            before the exception propagates.
    """
    hostname, path = _parser(url)
    request = "".join([
        "GET %s HTTP/1.0\r\n" % path,
        "Host: %s\r\n" % hostname,
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0\r\n",
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n",
        "Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3\r\n",
        "Cookie: BAIDUID=4782C3288E4A1689E0F8CBC0DF82BB1D:FG=1; BDUT=sc2x4782C3288E4A1689E0F8CBC0DF82BB1D13bda69e4000; H_PS_PSSID=1428_1667_1662\r\n",
        "Cache-Control: max-age=0\r\n",
        "\r\n",
    ])
    # sendall() needs a byte string.  Text (unicode on Python 2, str on
    # Python 3) is encoded explicitly; a Python 2 byte string passes through
    # untouched.
    if not isinstance(request, bytes):
        request = request.encode("ascii")
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((hostname, 80))
        s.sendall(request)
    except Exception:
        # Do not leak the descriptor when the host is unreachable.
        s.close()
        raise
    return s
#rlist,wlist,elist=select.select([],[],[])
def async_down(urls):
    """Download every URL in *urls* concurrently and save the responses.

    A request is sent for each URL with fetch(); the resulting sockets are
    then multiplexed with select() for at most 12 seconds.  Everything a
    socket receives (the raw response, status line and headers included) is
    decoded from GBK and written as UTF-8 to ``./results/<n>.html``, where
    ``<n>`` is the socket's descriptor number.  A download is complete when
    the peer closes the connection.  Downloads that fail or are still
    unfinished when the time budget runs out have their partial file deleted.

    Finally prints how many sockets were still unfinished at the deadline.

    Args:
        urls: Iterable of URLs to download.
    """
    time_budget = 12  # seconds allowed for the whole batch once requests are sent
    path_template = "./results/%s.html"
    new_decoder = codecs.getincrementaldecoder("gbk")

    # socket -> descriptor number.  The number is captured while the socket
    # is open because fileno() is not usable after close(), and it names the
    # output file.
    pending = {}
    # descriptor number -> (open output file, incremental GBK decoder)
    outputs = {}

    def discard_partial(fileno):
        # Close and delete the output of a download that did not finish.
        if fileno in outputs:
            outputs.pop(fileno)[0].close()
            os.remove(path_template % fileno)

    try:
        for url in urls:
            sock = fetch(url)
            pending[sock] = sock.fileno()

        deadline = time.time() + time_budget
        while pending:
            remaining = deadline - time.time()
            if remaining <= 0:
                break
            # The timeout makes select() return at the deadline even when
            # no socket ever becomes readable.
            readable, _, _ = select.select(list(pending), [], [], remaining)
            for sock in readable:
                fileno = pending[sock]
                try:
                    data = sock.recv(40960)
                except socket.error:
                    # One broken connection must not abort the other downloads.
                    sock.close()
                    del pending[sock]
                    discard_partial(fileno)
                    continue
                if data:
                    if fileno not in outputs:
                        outputs[fileno] = (
                            codecs.open(path_template % fileno, "w", "utf-8"),
                            new_decoder("ignore"),
                        )
                    out, decoder = outputs[fileno]
                    # The incremental decoder keeps a multi-byte character
                    # that is split across two recv() calls intact.
                    out.write(decoder.decode(data))
                else:
                    # Empty read: the peer closed, so the download is complete.
                    sock.close()
                    del pending[sock]
                    if fileno in outputs:
                        outputs.pop(fileno)[0].close()
    finally:
        # Whatever is still pending did not finish: release its descriptor
        # and remove the truncated file.
        for sock, fileno in pending.items():
            sock.close()
            discard_partial(fileno)
    print("left socket %s" % len(pending))
if __name__ == "__main__":
    # Script entry point: download the URLs listed in the CSV file and
    # report the elapsed wall-clock time in whole seconds.
    # (Call doctest.testmod() here to run the docstring examples instead.)
    start = datetime.datetime.now()
    async_down(read_urls())
    elapsed = datetime.datetime.now() - start
    print(elapsed.seconds)