A project of mine required targeted collection of articles from corporate websites.
Below is the generic URL-crawling code (writing a scraping rule for every individual site is not practical, so a generic approach is used); the article-extraction method itself is not published. The imported get_post helper is also private; a hedged sketch of it follows the script.
#!/usr/local/bin/python
#coding=utf8
# Simple targeted site crawler
#QQ:29295842
import get_post  # local helper, not published; see the sketch at the end
import re, sys, time
import os.path as osp
#from urlparse import urlparse      # Python 2
from urllib.parse import urlparse   # Python 3
def getUrl(url):  # crawl the whole site
    print(url)
    open_url = []  # already crawled
    d_url = []     # waiting to be crawled
    open_url, d_url = getOneUrl(url, open_url, d_url)  # seed the crawl
    while True:  # loop until the queue is drained
        if len(d_url) == 0:
            break
        xx_url = d_url[0]
        del d_url[0]
        if not xx_url in open_url:  # skip pages we have already crawled
            open_url, d_url = getOneUrl(xx_url, open_url, d_url)  # crawl one page
        open_url = list(set(open_url))
        d_url = list(set(d_url))
        print(xx_url, "===", len(open_url), "===", len(d_url))
        #d_url.remove(url)  # remove element
def getOneUrl(url, open_url, d_url):  # crawl a single page
    parts = urlparse(url)  # split the URL into components
    open_url.append(url)   # mark as crawled
    get_bool, html = get_post.get_web(url, ua="", timeout=10)
    href_arr = []
    if get_bool:
        format = formatURL(clearBlank(html), url)  # normalize the HTML
        if format == "":
            return open_url, d_url
        urls = re.findall(r'''(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href='([^']+)'[^>]*?>)''', format, re.I)
        for regs in urls:  # each match is one <a> tag
            # group 1 holds double-quoted hrefs, group 3 single-quoted ones
            sUrl = en2chr((regs[1] or regs[3]).strip())
            # keep only links that stay on the same host as the seed
            if re.search('https?://' + re.escape(parts.netloc) + '/', sUrl, re.I):
                href_arr.append(sUrl)
        if len(href_arr) >= 1:
            href_arr_x = list(set(href_arr))  # de-duplicate
            for colour in href_arr_x:
                if not colour in open_url:
                    d_url.append(colour)  # queue for crawling
    # print(open_url)
    # print(d_url)
    return open_url, d_url
#==================================================================
# Convert HTML entities back to plain characters
def en2chr(enStr):
    return enStr.replace('&amp;', '&')

# Strip redundant whitespace from the HTML
def clearBlank(html):
    if len(html) == 0:
        return ''
    html = re.sub(r'\r|\n|\t', '', html)
    while html.find('  ') != -1 or html.find('&nbsp;') != -1:
        html = html.replace('  ', ' ').replace('&nbsp;', ' ')
    return html
# Rewrite every <a href> in the page to an absolute URL
def formatURL(html, url):
    try:
        urls = re.findall(r'''(<a[^>]*?href="([^"]+)"[^>]*?>)|(<a[^>]*?href='([^']+)'[^>]*?>)''', html, re.I)
        for regs in urls:
            tag = regs[0] or regs[2]  # group 0: double-quoted tag, group 2: single-quoted
            html = html.replace(tag, matchURL(tag, url))
        return html
    except:
        return ""
# Resolve a single src/href/url() reference against the page URL
def matchURL(tag, url):
    try:
        urls = re.findall(r'''(.*)(src|href)=(.+?)( |/>|>).*|(.*)url\(([^\)]+)\)''', tag, re.I)
        if not urls:
            return tag
        if urls[0][5] == '':
            urlQuote = urls[0][2]  # src=/href= form
        else:
            urlQuote = urls[0][5]  # CSS url(...) form
        if len(urlQuote) > 0:
            cUrl = re.sub('''['"]''', '', urlQuote)
        else:
            return tag
        parts = urlparse(url)
        scheme = parts[0]
        if scheme != '':
            scheme += '://'
        host = scheme + parts[1]
        if len(host) == 0:
            return tag
        path = osp.dirname(parts[2])
        if path == '/':
            path = ''
        if cUrl.find("#") != -1:
            cUrl = cUrl[:cUrl.find("#")]  # drop the fragment
        # Work out what kind of reference this is
        if re.search('''^(http|https|ftp):(//|\\\\)(([\w/\\\+\-~`@:%])+\.)+([\w/\\\.\=\?\+\-~`@':!%#]|(&amp;)|&)+''', cUrl, re.I) != None:
            # already absolute (starts with a scheme): leave it alone
            return tag
        elif cUrl[:1] == '/':
            # root-relative path
            cUrl = host + cUrl
        elif cUrl[:3] == '../':
            # parent-relative path: climb one directory per '../'
            while cUrl[:3] == '../':
                cUrl = cUrl[3:]
                if len(path) > 0:
                    path = osp.dirname(path)
            cUrl = host + path + '/' + cUrl
        elif cUrl[:2] == './':
            cUrl = host + path + cUrl[1:]
        elif cUrl.lower()[:7] == 'mailto:' or cUrl.lower()[:11] == 'javascript:':
            return tag
        else:
            cUrl = host + path + '/' + cUrl
        return tag.replace(urlQuote, '"' + cUrl + '"')
    except:
        return ""
#==================================================================
if __name__ == "__main__":
    #getOneUrl("http://www.dzrpump.com/")  # crawl a single page
    getUrl("http://www.dzrpump.com/")  # crawl the whole site