Learning to build a simple search engine; the steps are as follows:
Crawling:
Start from an initial page: fetch and parse its contents, build the "page - word - position" relation and store it in the wordlocation table. Then look for links in the page; for each link, convert its relative path into an absolute URL and store it as a URL for the next level of crawling. At the same time, save the "source page URL - link URL - link text" relation in the link table.
That finishes one level of crawling. The newly collected URLs are then crawled as the next level.
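As a minimal sketch of the relative-to-absolute conversion, urljoin is the same call the crawler uses below; the URLs here are only examples:

from urlparse import urljoin

# a root-relative link resolves against the host of the page
print urljoin('http://www.bbc.com/sport/football/35622621', '/news/world')
# http://www.bbc.com/news/world
# a plain relative link resolves against the directory of the page
print urljoin('http://www.bbc.com/sport/football/35622621', 'video/123')
# http://www.bbc.com/sport/football/video/123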
Querying:
The user enters a query string, which is split into words. From these words a SQL query is built that finds the pages containing every word of the query, and the matching pages are recorded.
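For example, for the two-word query "football game" the generated statement is a self-join of the wordlocation table, one alias per query word (the word ids 234 and 1432 are the illustrative values used in the code comments below):

select w0.urlid, w0.location, w1.location
from wordlocation w0, wordlocation w1
where w0.wordid=234 and w0.urlid=w1.urlid and w1.wordid=1432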
Page ranking:
1. Word frequency: the more often the query words occur in a page, the higher its score.
2. Document location: the closer to the beginning of the document a word occurs, the higher the score.
3. Word distance: the closer together the query words appear in a page, the better.
4. Inbound links: the more links point to a page, the better.
5. PageRank: see the code for the details of the computation; it is quite simple. Note that the PageRank values do not have to be generated at query time: once crawling has finished and the database has been built, the PageRank of every page can be computed and stored, and the values only need to be refreshed periodically.
Computation: give every page an arbitrary initial PageRank and repeat the calculation for a number of iterations. With each iteration the PageRank of every page gets closer to its true value (a short sketch of this iteration follows the list).
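The update used in the code is PR(u) = 0.15 + 0.85 * sum over linking pages v of PR(v) / outlinks(v). A minimal sketch of the iteration on a toy in-memory link graph; the real code instead reads the link table from SQLite and writes the scores back to the pagerank table:

links = {1: [2, 3], 2: [3], 3: [1]}      # fromid -> list of toids
pr = dict((u, 1.0) for u in links)       # arbitrary initial value for every page
for _ in range(20):                      # iterate a fixed number of times
    for u in pr:
        score = 0.15
        for v in links:
            if u in links[v]:            # v links to u
                score += 0.85 * pr[v] / len(links[v])
        pr[u] = score                    # update in place, as calculatepagerank does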
Combining the rankings:
Depending on what the personalized search engine should emphasize, the scores of the individual metrics are combined. The scores should be normalized before they are combined, so that the different metrics are on a comparable scale; each metric is then given a weight and the final score is the weighted sum (a short sketch follows).
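A minimal sketch of the combination step, using two hypothetical score dictionaries keyed by urlid; the real normalization and weights live in searcher.normalizescores and searcher.getscoredlist below:

def normalize(scores, small_is_better=False):
    vsmall = 0.00001                      # avoid division by zero
    if small_is_better:
        minscore = min(scores.values())
        return dict((u, float(minscore) / max(vsmall, s)) for u, s in scores.items())
    maxscore = max(max(scores.values()), vsmall)
    return dict((u, float(s) / maxscore) for u, s in scores.items())

freq = {101: 12, 102: 3}                  # hypothetical word-frequency scores (bigger is better)
loc = {101: 40, 102: 8}                   # hypothetical location sums (smaller is better)
weights = [(1.0, normalize(freq)), (1.0, normalize(loc, small_is_better=True))]
totals = dict((u, sum(w * s[u] for w, s in weights)) for u in freq)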
The full code is below:
# -*- coding: utf-8 -*-
__author__ = 'Bai Chenjia'
import urllib2
# import BeautifulSoup
from bs4 import BeautifulSoup
from urlparse import urljoin
import sys
from sqlite3 import dbapi2 as sqlite
import re
import collections
reload(sys)
sys.setdefaultencoding('utf8')
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])
class crawler:
    # Initialize the crawler class with the name of the database
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        self.con.commit()

    # Helper function: return the id of an entry, inserting it into the
    # database first if it does not exist yet.
    # table is the table name, field the column name, value the value to look up
    def getentryid(self, table, field, value, createnew=True):
        cur = self.con.execute("select rowid from %s where %s='%s'"
                               % (table, field, value))
        # Fetch a single row from the query result
        res = cur.fetchone()
        # If res is None the value is not in the table yet, so insert it
        if res is None:
            cur = self.con.execute("insert into %s (%s) values ('%s')"
                                   % (table, field, value))
            return cur.lastrowid
        # A matching record was found
        else:
            return res[0]
    # Build the index for a page, given its url and the corresponding soup.
    # gettextonly and separatewords extract the individual words; each word is
    # then associated with the page, together with its position in the text
    def addtoindex(self, url, soup):
        if self.isindexed(url):
            return
        print "Indexing " + url
        # Get the individual words
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        # Get the id of this url
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to this url
        for i in range(len(words)):
            word = words[i]
            if word in ignorewords:
                # Skip stop words
                continue
            # Get the id of this word, creating it if necessary
            wordid = self.getentryid('wordlist', 'word', word)
            # Insert a urlid-wordid-location row into the wordlocation table
            self.con.execute("insert into wordlocation(urlid,wordid,location) \
                values (%d,%d,%d)" % (urlid, wordid, i))
    # Extract the text from an HTML page (no tags) as one long string
    def gettextonly(self, soup):
        v = soup.string
        if v is None:
            c = soup.contents
            resulttext = ''
            for t in c:
                # Recurse into the child nodes
                subtext = self.gettextonly(t)
                resulttext += subtext + '\n'
            return resulttext
        else:
            return v.strip()

    # Split the text on anything that is not a word character.
    # Breaks the string produced by gettextonly into separate words
    # so they can be added to the index
    def separatewords(self, text):
        # Regular expression that splits on non-word characters
        splitter = re.compile('\\W+')
        # Split the text into words and return the word list
        return [s.lower() for s in splitter.split(text) if s != '']
    # Return true if the url is already indexed
    def isindexed(self, url):
        # Look up the url; fetchone returns None if there is no such row
        u = self.con.execute("select rowid from urllist where url='%s'"
                             % url).fetchone()
        if u is not None:
            # Check whether it has actually been crawled
            v = self.con.execute("select * from wordlocation where urlid=%d"
                                 % u[0]).fetchone()
            if v is not None:
                return True
        return False

    # Add a link between two pages
    def addlinkref(self, urlFrom, urlTo, linkText):
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            "insert into link(fromid,toid) values (%d,%d)" % (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into linkwords(linkid,wordid) values (%d,%d)" % (linkid, wordid))
    # Starting from a small set of pages, do a breadth-first search up to the
    # given depth, indexing the pages along the way
    def crawl(self, pages, depth=2):
        for i in range(depth):
            # newpages is a set, so duplicate urls are only kept once
            newpages = set()
            for page in pages:
                print "page:", page
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read(), "html5lib")
                self.addtoindex(page, soup)
                # Look for link tags in the parsed page
                links = soup('a')
                for link in links:
                    # If the tag links to another page, extract the url and index it
                    if 'href' in dict(link.attrs):
                        # Convert the relative path of the link into an absolute url
                        url = urljoin(page, link['href'])
                        # print "url:", url
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]  # remove the fragment part
                        if url[0:4] == 'http' and not self.isindexed(url):
                            newpages.add(url)
                        linkText = self.gettextonly(link)
                        self.addlinkref(page, url, linkText)
                # Commit the changes to the database
                self.dbcommit()
            # The newly found urls become the pages for the next level
            pages = newpages
        print "end.."
    # Create the database tables
    def createindextables(self):
        self.con.execute('create table urllist(url)')
        self.con.execute('create table wordlist(word)')
        self.con.execute('create table wordlocation(urlid,wordid,location)')
        self.con.execute('create table link(fromid integer,toid integer)')
        self.con.execute('create table linkwords(wordid,linkid)')
        self.con.execute('create index wordidx on wordlist(word)')
        self.con.execute('create index urlidx on urllist(url)')
        self.con.execute('create index wordurlidx on wordlocation(wordid)')
        self.con.execute('create index urltoidx on link(toid)')
        self.con.execute('create index urlfromidx on link(fromid)')
        self.dbcommit()
    # Create the pagerank table, compute the pagerank of every page and store
    # the values in the table
    def calculatepagerank(self, iteration=20):
        # Drop any previous pagerank table; it has two columns, urlid
        # (the primary key) and score
        self.con.execute('drop table if exists pagerank')
        self.con.execute('create table pagerank(urlid primary key, score)')
        # Initialize the pagerank of every url to 1
        self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.dbcommit()
        print "Updating pagerank table..."
        for i in range(iteration):
            print "iteration ", i
            for (urlid,) in self.con.execute('select rowid from urllist'):
                pr = 0.15
                # Loop over all the pages that link to the current page
                for (linker,) in self.con.execute(
                        'select distinct fromid from link where toid=%d' % urlid):
                    # Get the pagerank of the linking page
                    linkingpr = self.con.execute(
                        'select score from pagerank where urlid=%d' % linker).fetchone()[0]
                    # Get the total number of links on the linking page
                    linkingcount = self.con.execute(
                        'select count(*) from link where fromid=%d' % linker).fetchone()[0]
                    pr += 0.85 * (linkingpr / linkingcount)
                # Write the new pagerank value back to the table
                self.con.execute(
                    'update pagerank set score=%f where urlid=%d' % (pr, urlid))
            self.dbcommit()
"""
上述crawler类用于爬取数据和生成数据库
下面的searcher类用于搜索
"""
class searcher:
    def __init__(self, dbname):
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    # Joint query against the database: split the query string q into words
    # and look for pages that contain every one of those words.
    # For the query "football game" the generated statement looks like
    # select w0.urlid,w0.location,w1.location from
    # wordlocation w0,wordlocation w1 where w0.wordid=234
    # and w0.urlid=w1.urlid and w1.wordid=1432
    def getmatchrows(self, q):
        # Strings to build the query
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        # Split the query into words
        words = q.split(' ')
        tablenumber = 0
        # Build the joint query one word at a time
        for word in words:
            # Get the word id
            wordrow = self.con.execute(
                "select rowid from wordlist where word='%s'" % word).fetchone()
            if wordrow is not None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += 'w%d.urlid=w%d.urlid and ' % (
                        tablenumber - 1, tablenumber)
                fieldlist += ',w%d.location' % tablenumber
                tablelist += 'wordlocation w%d' % tablenumber
                clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
                tablenumber += 1
        # Run the query; each row holds a urlid followed by the location of
        # every query word in that page
        if clauselist != '':
            fullquery = 'select %s from %s where %s' % (
                fieldlist, tablelist, clauselist)
            print fullquery
            cur = self.con.execute(fullquery)
            rows = [row for row in cur]
            # rows holds (urlid, location, ...) tuples, wordids the ids of the
            # query words
            return rows, wordids
        else:
            # Return empty results instead of None so that query() does not crash
            print "not found"
            return [], []
    # rows from getmatchrows holds (urlid, location, ...) tuples, wordids the
    # ids of the query words
    def getscoredlist(self, rows, wordids):
        # Initialize the (url, score) dictionary with every score set to 0
        totalscores = dict([(row[0], 0) for row in rows])
        # Weights of the different scoring functions used for the final score
        weights = [(1.0, self.locationscore(rows)),
                   (1.0, self.frequencyscore(rows)),
                   (2.0, self.pagerankscore(rows)),
                   (1.0, self.distancescore(rows)),
                   (1.0, self.inboundlinkscore(rows)),
                   (2.0, self.linktextscore(rows, wordids)),
                   (5.0, self.nnscore(rows, wordids))]
        # weight is a number, scores a dictionary with the score of every url
        # for that metric
        for weight, scores in weights:
            if scores is None:
                print "weight = ", weight
                print scores is None
            for url in totalscores:
                try:
                    totalscores[url] += weight * scores[url]
                except:
                    pass
                # print "url=",url,"scores=",len(scores.items())
        return totalscores
    # Get the url for a given rowid from urllist
    def geturlname(self, urlid):
        res = self.con.execute(
            "select url from urllist where rowid = %d" % urlid).fetchone()[0]
        return res

    # Produce the final search result: getmatchrows does the retrieval,
    # getscoredlist scores and ranks the pages, and the top 10 pages are
    # printed together with their real urls
    def query(self, q):
        rows, wordids = self.getmatchrows(q)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted([(score, url) for (url, score) in
                               scores.items()], key=lambda x: x[0], reverse=True)
        for score, urlid in rankedscores[0:10]:
            print score, self.geturlname(urlid)
    # Some metrics treat bigger as better, others smaller as better, so every
    # score dictionary is normalized to the range 0-1
    def normalizescores(self, scores, smallIsBetter=0):
        vsmall = 0.00001
        if smallIsBetter:
            # scores.values() returns the values of the dictionary as a list
            minscore = min(scores.values())
            res = []
            for u, l in scores.items():
                temp = float(minscore) / max(vsmall, l)
                res.append((u, temp))
            return dict(res)
        else:
            maxscore = max(max(scores.values()), vsmall)
            res = []
            for u, c in scores.items():
                temp = float(c) / maxscore
                res.append((u, temp))
            return dict(res)
    # Score documents by the position of the query words: the subject of a
    # document often appears near its beginning, so the earlier in the
    # document a word occurs, the better the score
    def locationscore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        # Default dictionary, missing keys get the value 0
        locations = collections.defaultdict(int)
        for row in rows:
            # Sum of the positions of all query words in this page
            loc = sum(row[1:])
            locations[row[0]] = loc
        return self.normalizescores(locations, smallIsBetter=1)

    # Score documents by word frequency
    def frequencyscore(self, rows):
        if len(rows) == 0:
            # Default dictionary, any missing key gets the value 0
            return collections.defaultdict(int)
        # Initialize the counters
        counts = dict([(row[0], 0) for row in rows])
        # Count the matching rows for every page
        for row in rows:
            counts[row[0]] += 1
        # Normalize
        return self.normalizescores(counts, smallIsBetter=0)
    # The closer together the query words appear in a page, the better the score
    def distancescore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        dis = collections.defaultdict(int)
        for row in rows:
            # Total distance between the query words in this page
            temp = sum([abs(row[i] - row[i - 1]) for i in range(2, len(row))])
            dis[row[0]] = temp
        # print "dis = ", dis.items()[:]
        return self.normalizescores(dis, smallIsBetter=1)

    # The importance of a page is given by the number of pages linking to it;
    # every inbound link carries the same weight
    def inboundlinkscore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        uniqueurls = set([row[0] for row in rows])
        bound_list = [(u, self.con.execute(
            'select count(*) from link where toid=%d' % u).fetchone()[0]) for u in uniqueurls]
        bound_dict = collections.defaultdict(int)
        for u1, v in bound_list:
            bound_dict[u1] = v
        return self.normalizescores(bound_dict, smallIsBetter=0)
    # Score documents by pagerank: read the stored values from the pagerank
    # table and normalize them
    def pagerankscore(self, rows):
        if len(rows) == 0:
            return collections.defaultdict(int)
        pagescore = [(row[0], self.con.execute(
            'select score from pagerank where urlid=%d' % row[0]).fetchone()[0]) for row in rows]
        pagerank_dict = dict(pagescore)
        return self.normalizescores(pagerank_dict, smallIsBetter=0)

    # Link-text scoring: a page receives the pagerank of every page that links
    # to it using one of the query words in the link text
    def linktextscore(self, rows, wordids):
        if len(rows) == 0:
            return collections.defaultdict(int)
        linkscores = dict([(row[0], 0) for row in rows])
        for wordid in wordids:
            cur = self.con.execute(
                'select link.fromid,link.toid from linkwords,link where wordid=%d and linkwords.linkid=link.rowid' % wordid)
            for (fromid, toid) in cur:
                if toid in linkscores.keys():
                    pr = self.con.execute('select score from pagerank where urlid=%d' % fromid).fetchone()[0]
                    linkscores[toid] += pr
        # Guard against division by zero when no link text matched a query word
        maxscore = max(max(linkscores.values()), 0.00001)
        normalizedscores = dict([(u, float(l) / maxscore) for (u, l) in linkscores.items()])
        return normalizedscores
    # Neural-network based scoring (not implemented here; always returns 0)
    def nnscore(self, rows, wordids):
        return collections.defaultdict(int)
if __name__ == "__main__":
    # Run the crawler on the seed pages
    # pagelist = ["http://www.bbc.com/sport/football/35622621"]
    # new_crawler = crawler(dbname='')
    # new_crawler.crawl(pagelist)

    # This statement may only be run once; it creates the database tables
    # newcrawler.createindextables()

    # Build the index
    # newcrawler = crawler('searchindex.db')
    # pages = ['http://www.bbc.com/sport/football/35622621']
    # newcrawler.crawl(pages)

    # Update the pagerank table
    # newcrawler = crawler('searchindex.db')
    # newcrawler.calculatepagerank()

    # Check the database: print the pages with the highest pagerank values
    # newcrawler = crawler('searchindex.db')
    # a = newcrawler.con.execute('select * from pagerank order by score desc').fetchall()[0:10]
    # e = searcher('searchindex.db')
    # for i in range(10):
    #     print e.geturlname(a[i][0]), "score=", a[i][1]

    # e = searcher('searchindex.db')
    # result = e.getmatchrows('football game')
    # print "result = ", result[:]

    # Call query to print the final ranking of the pages by weighted score
    e = searcher('searchindex.db')
    e.query('football game')

    # test function: normalizescores
    # testscore = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
    # e = searcher('searchindex.db')
    # res = e.normalizescores(testscore, 0)
    # print res