发起一个开源项目:http://www.abelkhan.com/
目前而言,已经用python编写了一个网络爬虫抓取页面,和一个简单的前端
网络爬虫,已经有很多高手写过,我基本上奉行了拿来主义,
得益于python完善的lib,这个网络爬虫实现起来非常的简单:
使用urllib2从对应的url地址抓取html
def get_page(url):
    """Download ``url`` and return its body together with the response headers.

    The request is sent with browser-like headers (a Microsoft Edge user
    agent) because some sites reject clients that look like crawlers.  A new
    cookie jar is attached to the opener on every call.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(body, headers)`` tuple, where ``body`` is whatever
        ``response.read()`` returned and ``headers`` is the object returned by
        ``response.info()``.  If anything goes wrong the traceback is printed
        and ``None`` is returned instead, so callers must check the result
        before unpacking it.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebkit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.10240',
        'Connection': 'Keep-Alive',
        # The published listing read "p_w_picpath/jxr", a blog-platform
        # mangling of the media type "image/jxr".
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.8,en-US;q=0.5,en;q=0.3',
    }

    try:
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        req = urllib2.Request(url=url, headers=request_headers)
        response = opener.open(req, timeout=5)
        try:
            the_page = response.read()
            response_headers = response.info()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
        return the_page, response_headers
    except Exception:
        # Best-effort crawler: report the failure and carry on.  Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        import traceback
        traceback.print_exc()
        return None
一个需要注意的地方是,有部分网站会限制爬虫访问,所以我加入了headers用于模拟浏览器访问。
这个方法差强人意,但是我也没有找到一个更完善的办法。
抓取到页面后,基于HTMLParser做了html的解析:
class htmlprocess(HTMLParser.HTMLParser):
    """HTML parser that collects keywords, title/summary text and out-links.

    The parser fills in the ``urlinfo`` dict it is given (the record of the
    page being parsed) and builds ``urllist``, a dict of records for the links
    found on that page.  Every record has the same layout::

        {'url': ..., 'keys': {'1': [], '2': [], '3': []}, 'title': '',
         'titlegen': [], 'profile': {'0': '', '1': '', '2': []}}

    For the current page, ``keys['1']`` receives tokens from the URL and the
    ``<meta>`` keywords, ``keys['2']`` tokens from ``<title>`` and
    ``keys['3']`` tokens from any other text.  For a linked page,
    ``keys['2']`` receives tokens from the anchor text.

    NOTE(review): no ``handle_endtag`` is defined, so ``current_tag`` keeps
    the most recently *opened* tag; text that follows a closing tag is still
    attributed to that tag.
    """

    # URL tokens that are never stored as keywords.
    _URL_STOPWORDS = ("com", "www", "cn")

    # Links on cnblogs pages containing any of these substrings are ignored.
    # The original elif chain also listed
    # ".../send?recipient=itwriter/", ".../send?recipient=itwriter/GetUsername.aspx"
    # and ".../EditPosts.aspx?postid=1935371"; each of those contains one of
    # the entries below, so dropping them does not change the outcome.
    _CNBLOGS_BLOCKED = (
        "http://msg.cnblogs.com/send?recipient=itwriter",
        "http://i.cnblogs.com/EditPosts.aspx?opt=1",
        "/EnterMyBlog.aspx?NewArticle=1",
        "GetUsername",
        "GetMyPassword",
        "http://i.cnblogs.com/EditPosts.aspx?postid=",
    )

    def __init__(self, urlinfo):
        """Prepare to parse the page described by ``urlinfo``.

        Args:
            urlinfo: Record of the page being parsed (layout in the class
                docstring).  It is mutated in place; tokens of its ``'url'``
                are appended to ``urlinfo['keys']['1']`` immediately.
        """
        HTMLParser.HTMLParser.__init__(self)

        self.urllist = {}        # link URL -> record of the linked page
        self.sub_url = ""        # link currently open for anchor text, or ""

        self.urlinfo = urlinfo
        self.current_url = urlinfo['url']

        for key in doclex.simplesplit(self.current_url):
            if key not in self._URL_STOPWORDS:
                self.urlinfo['keys']['1'].append(key)

        self.current_tag = ""    # name of the most recently opened tag
        self.style = ""          # kind of <meta> being read: 'keywords', 'profile' or 'None'

    @staticmethod
    def _new_urlinfo(url):
        """Return an empty record for the linked page ``url``."""
        return {'url': url, 'keys': {'1': [], '2': [], '3': []}, 'title': '', 'titlegen': [], 'profile': {'0': '', '1': '', '2': []}}

    @staticmethod
    def _to_unicode(raw):
        """Decode ``raw`` with the encoding chardet guesses, or return None.

        Returns None when chardet reports no encoding.  Decoding errors are
        not caught here.
        """
        encoding = chardet.detect(raw)['encoding']
        if not encoding:
            return None
        return unicode(raw, encoding)

    def handle_starttag(self, tag, attrs):
        """Record the opened tag and process ``<meta>`` and link tags.

        Args:
            tag: Tag name as reported by HTMLParser.
            attrs: List of ``(name, value)`` attribute pairs; ``value`` is
                ``None`` for an attribute written without a value.
        """
        self.current_tag = tag
        self.style = 'None'
        self.sub_url = ""

        if tag == 'meta':
            self._handle_meta(attrs)

        if tag in ('a', 'A', 'link'):
            self._handle_link(attrs)

    def _handle_meta(self, attrs):
        """Store keywords / description carried by a ``<meta>`` tag."""
        # First pass: find out what kind of meta tag this is.
        for name, value in attrs:
            if name == 'name':
                if value in ('keywords', 'metaKeywords'):
                    self.style = 'keywords'
                elif value in ('description', 'metaDescription'):
                    self.style = 'profile'

        # Second pass: consume its content accordingly.
        for name, value in attrs:
            # A valueless ``content`` attribute arrives as None; skip it.
            if name != 'content' or value is None:
                continue

            if self.style == 'keywords':
                keywords = doclex.simplesplit(value)
                if isinstance(keywords, list):
                    self.urlinfo['keys']['1'].extend(keywords)
            elif self.style == 'profile':
                self.urlinfo['profile']['0'] = value

                udata = self._to_unicode(value)
                if udata is not None:
                    # Keep at most the first 16 characters, stored as UTF-8.
                    self.urlinfo['titlegen'].append(udata[:16].encode('utf-8'))
                else:
                    self.urlinfo['titlegen'].append(value)

    def _handle_link(self, attrs):
        """Register the ``href`` of a link tag in ``urllist`` if acceptable.

        Processing stops at the first ``href`` that is rejected.
        """
        for name, value in attrs:
            if name != 'href':
                continue

            # Empty or valueless (None) href: nothing to follow.
            if not value:
                return

            if not judged_url(value):
                # Prefix the current page URL, inserting '/' only when
                # neither side already provides one.
                if not self.current_url.endswith('/') and not value.startswith('/'):
                    value = self.current_url + '/' + value
                else:
                    value = self.current_url + value

            if 'javascript' in value or 'javaScript' in value:
                return

            if 'apple' in self.current_url:
                if "http://www.apple.com/cn/mac#ac-gn-menustate" in value:
                    return

            if 'cnblogs' in self.current_url:
                if any(marker in value for marker in self._CNBLOGS_BLOCKED):
                    return
                if value.endswith('#'):
                    value = value[:-1]

            # Skip links that are a substring of the current page URL.
            if value in self.current_url:
                return

            if value.endswith('#'):
                value = value[:-1]

            if value != self.current_url and len(value) < 64 and not ingoreurl(value):
                self.urllist[value] = self._new_urlinfo(value)
                self.sub_url = value
                print(value)

    def handle_data(self, data):
        """Route a text node to the handler for the most recently opened tag.

        Any exception raised while processing the text is printed and
        otherwise ignored, so one bad text node does not abort the parse.
        """
        try:
            if self.current_tag == 'title':
                self._handle_title_text(data)
            elif self.current_tag == 'a':
                self._handle_anchor_text(data)
            else:
                self._handle_body_text(data)
        except Exception:
            import traceback
            traceback.print_exc()

    def _handle_title_text(self, data):
        """Store the page title and its tokens (``keys['2']``)."""
        data = doclex.delspace(data)

        keys = doclex.lex(data)
        if isinstance(keys, list):
            self.urlinfo['keys']['2'].extend(keys)

        if len(data) > 0:
            self.urlinfo['title'] = data

    def _handle_anchor_text(self, data):
        """Attach anchor text to the record of the link currently open."""
        if self.sub_url == "":
            return

        entry = self.urllist[self.sub_url]

        keys = doclex.simplesplit(data)
        if isinstance(keys, list):
            buckets = entry['keys']
            for key in keys:
                if key in buckets['3']:
                    buckets['3'].remove(key)
                if key not in buckets['1'] and key not in buckets['2']:
                    buckets['2'].append(key)

        udata = self._to_unicode(data)
        if udata is not None:
            entry['titlegen'].append(udata[:16].encode('utf-8'))
            if len(udata) > 16:
                entry['profile']['1'] = udata[0:32].encode('utf-8')

    def _handle_body_text(self, data):
        """Store a title candidate, a snippet and tokens for other text."""
        if doclex.invialddata(data):
            return

        data = doclex.delspace(data)

        udata = self._to_unicode(data)
        if udata is None:
            # chardet had no guess.  The original code raised here and only
            # printed a traceback, so skipping silently keeps the same net
            # effect without the noise.
            return

        self.urlinfo['titlegen'].append(udata[:16].encode('utf-8'))
        if len(udata) > 32:
            self.urlinfo['profile']['2'].append((udata[0:32] + u"...").encode('utf-8'))

        self.urlinfo['keys']['3'].extend(doclex.lex(data))
基本上,HTMLParser的使用方法见官方文档:HTMLParser预先定义了一组虚接口handle_starttag,handle_data和handle_endtag,使用者通过重写(override)这三个接口,来实现对html中的tag进行处理,进而完整地解析抓取到的html。
然后从搜索结果来看,搜索的质量还很不尽如人意,欢迎大家参与并提出意见。
项目地址:http://www.abelkhan.com/
向我们提出意见:http://www.abelkhan.com/guestbook/
对项目进行捐助:http://www.abelkhan.com/collection/
代码托管地址如下:https://github.com/qianqians/websearch ,欢迎大家参与。