import os
import sys
import time
import urllib
from urllib import request
import configparser
import re
import ctypes
import logging
import logging.handlers
import datetime
# Win32 console constants: handle IDs for GetStdHandle and colour bit flags
# for SetConsoleTextAttribute.
# NOTE(review): nothing in the visible code uses these (or the ctypes
# import) -- presumably leftovers from a coloured-console logger; confirm
# before removing.
STD_INPUT_HANDLE = -10
STD_OUTPUT_HANDLE= -11
STD_ERROR_HANDLE = -12
# Foreground colour bits (combine with | ; add INTENSITY for bright).
FOREGROUND_BLACK = 0x0
FOREGROUND_BLUE = 0x01
FOREGROUND_GREEN= 0x02
FOREGROUND_RED = 0x04
FOREGROUND_INTENSITY = 0x08
# Background colour bits.
BACKGROUND_BLUE = 0x10
BACKGROUND_GREEN= 0x20
BACKGROUND_RED = 0x40
BACKGROUND_INTENSITY = 0x80
class logger_t:
    """Thin wrapper around a rotating file logger.

    Writes to ``<YYYYMMDD>.log`` in the current directory, rotating at
    1 MiB with up to 5 backup files.

    Note: ``logging.getLogger('tst')`` returns a process-wide singleton,
    so all instances share the same underlying logger.
    """

    def __init__(self):
        logname = time.strftime('%Y%m%d', time.localtime(time.time())) + ".log"
        handler = logging.handlers.RotatingFileHandler(
            logname, maxBytes=1024 * 1024, backupCount=5)
        fmt = '%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(message)s'
        handler.setFormatter(logging.Formatter(fmt))
        self.logger = logging.getLogger('tst')
        # BUG FIX: the logger is a shared singleton; the original added a
        # fresh handler on every instantiation, duplicating each log line.
        if not self.logger.handlers:
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)

    def info(self, msg):
        """Log *msg* at INFO level.  (Parameter renamed from the
        builtin-shadowing name ``str``.)"""
        self.logger.info(msg)

    def debug(self, msg):
        """Log *msg* at DEBUG level."""
        self.logger.debug(msg)
class config:
    '''
    This Class is used to parser configfile.

    Parameters
    ----------
    path : str, optional
        Location of the INI file.  Defaults to the historical hard-coded
        relative path so existing ``config()`` callers keep working.

    Raises
    ------
    IOError
        If the config file does not exist (same exception type as before,
        now with an explanatory message).
    '''

    def __init__(self, path="../../config/config.ini"):
        if not os.path.exists(path):
            raise IOError("config file not found: " + path)
        self.cf = configparser.ConfigParser()
        self.cf.read(path)
    # NOTE: the original __del__ was a no-op expression statement
    # (``self.cf``) and has been removed -- ConfigParser needs no cleanup.

    def GetValue(self, sec, key):
        """Return the raw string value for *key* in section *sec*."""
        return self.cf.get(sec, key)
class webParser:
    """Crawl the configured listing pages and download every image found.

    Driven entirely by the INI config object passed to the constructor:

      [Input]    url              -- entry page
      [PageDown] start, end, reg  -- text delimiters + regex for page links
      [Element]  start, end, reg  -- text delimiters + regex for image URLs
      [Output]   dir              -- download directory (created if absent)
    """

    def __init__(self, config):
        # Base headers for every request.  Copied per-page in work() so
        # that Referer/Accept additions do not leak between requests
        # (the original mutated this shared dict in place).
        self.http_headers = {
            'Connection':'keep-alive',
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
        }
        self.config = config

    def __GetHtmlContent(self, url, encod="utf8"):
        """Fetch *url* and return its body decoded with *encod*.

        BUG FIX: *encod* was previously ignored (decode() always used the
        utf-8 default); it is honoured now -- backward compatible because
        the default is still utf-8.
        """
        req = urllib.request.Request(url, headers=self.http_headers)
        with urllib.request.urlopen(req) as fp:
            html = fp.read()
        return html.decode(encod)

    def __pickout(self, dict_d):
        """Return all matches of dict_d['reg'] inside the substring of the
        page at dict_d['url'] running from dict_d['start'] through
        dict_d['end'] (inclusive).

        Returns '' when the start marker is absent -- kept from the
        original as a sentinel that iterates as empty in work().
        """
        html = self.__GetHtmlContent(dict_d['url'])
        begin = html.find(dict_d['start'])
        if begin < 0:
            return ''
        tail = html[begin:]
        end_marker = dict_d['end']
        snippet = tail[:tail.find(end_marker) + len(end_marker)]
        return re.compile(dict_d['reg']).findall(snippet)

    def __createdir(self, dir):
        # mkdir (not makedirs) preserves the original single-level behavior.
        if not os.path.isdir(dir):
            os.mkdir(dir)

    def __download(self, url, head):
        """Download one image into the configured output dir.

        Returns 1 on success, 0 on any network/read failure (printed,
        never raised -- same best-effort contract as the original).
        """
        t1 = time.time()
        head['Accept'] = 'image/webp,image/*,*/*;q=0.8'
        output = self.config.GetValue('Output','dir')
        self.__createdir(output)
        # rfind keeps the leading '/', which doubles as the path separator
        # when concatenated onto the output directory below.
        picname = url[url.rfind("/"):]
        try:
            req = urllib.request.Request(url, headers=head)
            webpage = urllib.request.urlopen(req)
        except Exception as e1:
            print(e1)
            return 0
        # BUG FIX: read the response *before* opening the output file; the
        # original created (and leaked) an empty file when read() failed,
        # and never closed the response object.
        try:
            pic_binary_data = webpage.read()
        except Exception as e:
            print(e)
            return 0
        finally:
            webpage.close()
        size = len(pic_binary_data)
        # A single write() on a binary file writes the whole buffer; the
        # original's while-loop "progress" bookkeeping was a no-op (and an
        # infinite loop if write() had ever returned short).
        with open(output + picname, "wb") as ids:
            ids.write(pic_binary_data)
        t2 = time.time()
        print(url+ " Cost "+str( (t2-t1)*1000 ) + " ms!!!size=" + str(size) +" !!!" )
        time.sleep(0.1)  # be polite to the remote server
        return 1

    def work(self):
        """Crawl the entry page, then each linked page, downloading images."""
        # BUG FIX: the original read the module-global 'config' throughout;
        # use the instance handed to __init__ so a webParser works outside
        # the script's __main__ block too.
        cfg = self.config
        url = cfg.GetValue("Input","url")
        # Strip the path component to recover the scheme://host prefix.
        baseurl_path = urllib.parse.urlparse(url).path
        baseurl = url[:url.find(baseurl_path)]
        page_spec = {"start": cfg.GetValue("PageDown", "start"),
                     "end":   cfg.GetValue("PageDown", "end"),
                     "reg":   cfg.GetValue("PageDown", "reg"),
                     "url":   url}
        pic_total_dl = 0
        ts = time.time()
        for link in self.__pickout(page_spec):
            refer = baseurl + link
            # BUG FIX: copy the shared headers instead of aliasing them,
            # so Referer does not persist across pages and into
            # self.http_headers itself.
            head = dict(self.http_headers)
            head['Referer'] = refer
            elem_spec = {"start": cfg.GetValue("Element", "start"),
                         "end":   cfg.GetValue("Element", "end"),
                         "reg":   cfg.GetValue("Element", "reg"),
                         "url":   refer}
            for pic in self.__pickout(elem_spec):
                if self.__download(pic, head):
                    pic_total_dl += 1
        te = time.time()
        print(str(pic_total_dl ) + " pictures has been download!!!,Total Cost "
            + str( (te-ts)*1000 ) + "ms!!!")
if __name__ == '__main__':
    # Deliberately rebinds the module-global name 'config' from the class
    # to its instance: the original work() implementation reads that
    # global, so the name must stay bound here.
    config = config()
    webParser(config).work()