#!/usr/bin/python
# Standard library (Python 2)
import StringIO
import datetime
import hashlib
import os
import sys
import threading
import urllib2
import urlparse
from time import ctime, sleep

# Third-party
import pycurl
from lxml import etree
# Wall-clock start of the script; the elapsed time is printed at the bottom.
starttime = datetime.datetime.now()
def testf():
    """Fetch http://www.weituanpin.com/ with pycurl and print the raw HTML."""
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, "http://www.weituanpin.com/")
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.perform()
    print(buf.getvalue())
    #sleep(500)
    #print b.getvalue()
def urllibget(i, j):
    """Download page *i* (the j-th URL of the batch) and scrape its images.

    i -- URL of the page to fetch
    j -- sequence number of the URL (only used by the commented-out dump)
    """
    response = urllib2.urlopen(i)
    try:
        html = response.read()
    finally:
        # fix: the response handle was previously never closed (leak,
        # one per worker thread).
        response.close()
    #write_file(html, str(j))
    show_pach(html, i)
def show_pach(html, url):
    """Parse *html*, find every <img> that has a src attribute, and download
    each image.

    html -- page markup to scan
    url  -- base URL of the page, used to resolve relative image paths
    """
    tree = etree.HTML(html)
    nodes = tree.xpath(u"/html")
    nodes = nodes[0].xpath("//img[@src]")
    for n in nodes:
        src = n.attrib["src"]
        # fix: naive `url + src` concatenation broke relative paths such as
        # "/a.png", "../a.png" and scheme-relative "//host/a.png".  urljoin
        # resolves all of those correctly and leaves absolute URLs untouched.
        src = urlparse.urljoin(url, src)
        download_img(src)
def write_file(html, file):
    """Append *html* to file/<file>.txt and print a confirmation.

    html -- text to append
    file -- basename (no extension) of the target file.  NOTE(review): the
            name shadows the Python 2 builtin `file`; kept unchanged for
            caller compatibility.
    """
    # fix: `with` guarantees the handle is closed even if the write raises
    # (the original leaked it on error).
    with open("file/" + file + ".txt", "a") as fsock:
        fsock.write(html)
    print(file + " is OK\n")
def download_img(url):
    """Download the image at *url* and save it under file/ using an
    md5-of-URL based filename (stable and filesystem-safe)."""
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # fix: the response handle was previously never closed.
        response.close()
    digest = hashlib.md5(url).hexdigest()
    # fix: removed a stray no-op hexdigest() call; `with` closes the file
    # even if the write raises.
    with open("file/nimabi" + digest + ".jpg", "wb") as fk:
        fk.write(data)
    print(url)
# Candidate crawl targets (heavily duplicated).  NOTE(review): this tuple is
# never referenced below -- the loop at the bottom iterates over `c` instead;
# presumably leftover test data for a larger multi-threaded run.
a = ('http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.weituanpin.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.sina.com.cn/',
'http://www.sohu.com/',
'http://www.163.com/',
'http://www.zhihu.com/',
'http://www.xunlei.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/',
'http://www.gelonghui.com/',
'http://www.jianshu.com/',
'http://www.jianshu.com/',
'http://www.xiachufang.com/',
'http://www.163.com/')
# Seed URLs actually crawled (the big tuple `a` above is unused).
c = ("http://2mn.tv","http://2mn.tv")
j = 0
for i in c:
    j = j + 1
    # One thread per URL.  NOTE(review): the threads are never join()ed, so
    # the elapsed-time print at the bottom can run before downloads finish.
    t = threading.Thread(target=urllibget,args=(i,j,))
    t.start()
    # NOTE(review): `c` holds only 2 URLs, so this j == 3 early-exit guard
    # can never fire -- presumably leftover from a longer URL list.
    if j == 3:
        break
# HTTPS via pycurl -- used to keep the script from timing out on these pages.
def testf_https():
    """Fetch one 1688.com product page over HTTPS and hand the HTML to
    show_pach_https.

    Certificate verification is deliberately disabled for this target --
    do not reuse this pattern for sensitive traffic.
    """
    page = "https://detail.1688.com/offer/528970869962.html?spm=a312h.7841636.1998813769.d_pic_14.Cm06wt&tracelog=p4p"
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, page)
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    curl.perform()
    #print html
    show_pach_https(buf.getvalue(), page)
def show_pach_https(html, url):
    """Parse *html* and download every image found in the document body.

    html -- page markup to scan
    url  -- page URL (currently unused; kept for interface compatibility)
    """
    tree = etree.HTML(html)
    nodes = tree.xpath(u"/html/body")
    # fix: select only //img[@src] (as show_pach does) -- the bare //img
    # query raised KeyError on images without a src attribute.
    for n in nodes[0].xpath("//img[@src]"):
        src = n.attrib["src"]
        if src.find("http") == -1:
            # assumes scheme-relative "//host/path" sources -- TODO confirm;
            # a plain relative path would yield a broken "http:xxx" URL.
            src = "http:" + src
        # fix: removed the dead `else: src = src` no-op branch.
        print(src)
        download_img_https(src)
def download_img_https(url):
    """Download *url* with pycurl (TLS verification off, matching
    testf_https) and save it under file/ with an md5-derived name."""
    buf = StringIO.StringIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    # NOTE(review): certificate checks are deliberately disabled -- insecure,
    # acceptable only for this throwaway image scrape.
    curl.setopt(pycurl.SSL_VERIFYPEER, 0)
    curl.setopt(pycurl.SSL_VERIFYHOST, 0)
    curl.perform()
    digest = hashlib.md5(url).hexdigest()  # stable, filesystem-safe name per URL
    # fix: removed a stray no-op hexdigest() call; `with` closes the file
    # even if the write raises (the original leaked the handle on error).
    with open("file/nimabi" + digest + ".jpg", "wb") as fk:
        fk.write(buf.getvalue())
    print(url)
# Report the script's total wall-clock runtime in whole seconds.
endtime = datetime.datetime.now()
print((endtime - starttime).seconds)