北大天网搜索引擎TSE分析及完全注释[1]寻找搜索引擎入口

本着黑客精神,我将陆续把最近分析、注释TSE搜索引擎的心得发布出来。老鸟、大虾、大牛、高手飘过就是了,若愿意花时间指点一下小弟,在下不胜感激;有问题的朋友请直接留言讨论。由于本人水平有限,分析和翻译难免有错,让大家见笑了。

上学期拜读了James F. Kurose著的《计算机网络——自顶向下方法与Internet特色(第三版,影印版)》,觉得写得确实不错(希望没看过的朋友一定要买来看看),自己也来搞个自顶向下的学习方法:先从用户看得到的东西出发分析研究搜索引擎。下面我们就来看看各大搜索引擎搜索界面的代码,需要特别注意的是form表单中的action属性。

雅虎http://www.yahoo.com/:

<!-- Yahoo search box: the text field "p" is submitted to http://search.yahoo.com/search.
     No method attribute is given, so the HTML default (GET) applies. -->
<form name=s1 style="margin-bottom:0" action="http://search.yahoo.com/search">
<table cellpadding=0 cellspacing=0 border=0><tr><td>
<input type=text size=30 name=p title="enter search terms here">&nbsp;
<input type=submit value=Search>&nbsp;&nbsp;</td><td><font face=arial size=-2>&#183;&nbsp;
<a href="http://search.yahoo.com/search/options?p=">advanced search</a><br>&#183;&nbsp;
<a href="http://buzz.yahoo.com/">most popular</a></font></td></tr></table></form>

谷歌http://www.g.cn:

<!-- Google search box: the text field "q" plus four hidden fields are sent with GET to /search. -->
<form method=GET action=/search><tr><td nowrap>
<font size=-1><input type=text name=q size=41 maxlength=2048 value="jrckkyy" title="Google 搜索"> <input type=submit name=btnG value="Google 搜索"><input type=hidden name=complete value=1><input type=hidden name=hl value="zh-CN"><input type=hidden name=newwindow value=1><input type=hidden name=sa value="2"></font></td></tr></form>

百度http://www.baidu.com:

<!-- Baidu search box: the text field "wd" plus several hidden fields are submitted to /s.
     No method attribute is given, so the HTML default (GET) applies. -->
<form name=f2 action="/s">
<tr valign="middle">
<td nowrap>
<input type=hidden name=ct value="0">
<input type=hidden name=ie value="gb2312">
<input type=hidden name=bs value="jrckkyy">
<input type=hidden name=sr>
<input type=hidden name=z value="">
<input type=hidden name=cl value=3>
<input type=hidden name=f value=8>
<input name=wd size="35" class=i value="jrckkyy" maxlength=100>
<input type=submit value=百度一下> <input type=button value=结果中找 onclick="return bq(f2,1,0);">&nbsp;&nbsp;&nbsp;</td>
<td nowrap><a href="http://utility.baidu.com/quality/quality_form.php?word=jrckkyy">与百度对话</a></td>
</tr>
</form>

天网http://www.tianwang.com/:

<!-- Tianwang search box: the text field "word" and the two selects "range" and "cd"
     are sent with GET to /cgi-bin/tw. Both selects are filled at page-load time by
     inline scripts that read a page-level array named rescode (defined outside this snippet). -->
<form name=f action="/cgi-bin/tw" method=get>
<td valign=center width=634 background=images/index_image_02.gif>
  <table height=46 cellspacing=0 cellpadding=0 width=600 align=right border=0>
    <tbody>
      <tr>
        <td height=50>
          <table cellspacing=0 cellpadding=0 width=600 border=0>
            <tbody>
              <tr>
                <td width="524" height="30" valign="bottom">
                  <div align="center">
                    <input name="word" type="text" size="40" maxlength="255" onClick="this.focus();checkWord(this,1)" onBlur='checkWord(this,0)' value='请输入资源名称'>
                    <font color=#ffffff> &nbsp;
                    <select onChange=reRange(this.selectedIndex) name=range>
                      <script language=javascript>
                      <!--
                      // One <option> per rescode entry, labelled with rescode[i][0];
                      // the first entry is emitted as the preselected option.
                      for (var i = 0; i < rescode.length; i++) {
                          if (i == 0) {
                              document.write('<option value="0" selected>' + rescode[i][0] + '</option>');
                          } else {
                              document.write('<option value="' + i + '">' + rescode[i][0] + '</option>');
                          }
                      }
                      document.f.range.selectedIndex = 0;
                      -->
                      </script>
                    </select>
                    </font> - <font color=#ffffff>
                    <select name=cd>
                      <script language=javascript>
                      <!--
                      // After the label at index 0, rescode[ind] is read as (value, text)
                      // pairs, hence (length - 1) / 2 options for the selected range.
                      var ind = document.f.range.selectedIndex;
                      var len = (rescode[ind].length - 1) / 2;
                      var sel = 0;
                      for (var i = 0; i < len; i++) {
                          document.write('<option value="' + rescode[ind][2*i+1] + '">' + rescode[ind][2*i+2] + '</option>');
                          // Remember the option whose value is 0 so it can be preselected.
                          if (rescode[ind][2*i+1] == 0)
                              sel = i;
                      }
                      document.f.cd.selectedIndex = sel;
                      -->
                      </script>
                    </select>
                    </font></div>
                </td>
                <td width="71" valign="bottom"><input id=submit2 type=image height=22 width=40 src="images/so2.gif" align=absMiddle name=submit></td>
              </tr>
              <tr>
                <td colspan=3 height=25 class=style16>
                  <div align=center></div>
                </td>
              </tr>
            </tbody>
          </table>
        </td>
      </tr>
    </tbody>
  </table>
</td>
</form>

测试服务器TSE:

<!-- TSE test-server search box: the text field "word" and the hidden field "cdtype"
     are sent with GET to /cgi-bin/index/TSESearch. -->
<form method="get" action="/cgi-bin/index/TSESearch" name="tw">
  <td width="100%" height="25" align="center">
    <input type="text" name="word" size="55">
    <input type="submit" value=" 搜索" name="www">
  </td>
  <input type="hidden" name="cdtype" value="GB">
</form>

由以上几个form的属性可以看出,它们全部采用get方法提交(未写method属性的表单默认也是get),并以CGI程序作为处理程序(TSE的CGI程序是用C/C++编写的)。CGI全称是“通用网关接口”(Common Gateway Interface),是HTTP服务器与本机或其它机器上的程序进行“交谈”的一种工具,其程序须运行在网络服务器上。CGI逐渐被近几年来的PHP、Java、ASP、Perl、Python、Ruby等动态语言所取代,但是其在速度和运行效率上的优势是无法取代的。

以下是TSE CGI入口程序注释,其他搜索引擎的入口也应该类似

 

/**
 * 程序翻译说明
 * @Copyright (c) 2008, 研发部
 * All rights reserved.
 *
 * @filesource  TSESearch.cpp
 * @author  jrckkyy <[email protected]>
 *
 * Let's start
 *
 
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <unistd.h>

#include <iostream>
#include <fstream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>

#include "Comm.h"          // original note: covers two index files and one data file
#include "Query.h"         // query handling
#include "Document.h"      // HTML document handling
#include "StrFun.h"        // string helpers
#include "ChSeg/Dict.h"    // segmentation dictionary
#include "ChSeg/HzSeg.h"
#include "DisplayRst.h"    // result page output, split into top, middle and bottom parts

using   namespace  std;

/*
 * An inverted file (INF) includes a term-index file and an inverted-lists file.
 * An inverted-lists file consists of many buckets (posting lists).
 * The term-index file is stored in vecTerm, and
 * the inverted lists are stored in mapBuckets.
 */


/**
 * 程序翻译说明
 * 搜索程序入口前台关键字提交到该cgi程序 例如:./cgi-bin/index/TSESearch?word=123&start=1
 * 倒排文件包括一个记录检索词文件和一个倒排列表文件。
 * 倒排列表包含很多标志(提交名单)。
 * 记录检索词文件使用vecTerm来排序,和倒排列表是用mapBuckets来排序。
 *
 * @access  public
 * @param   int char 参数的汉字说明 用于接收前台get传递的参数
 * @return  string 0
 
*/

int  main( int  argc,  char *  argv[])
{
    
struct timeval begin_tv, end_tv;
    
struct timezone tz;

    CDict iDict;
    map
<stringstring> dictMap, mapBuckets;
    vector
<DocIdx> vecDocIdx;    //Document。h

    CQuery iQuery;
    iQuery.GetInputs();        
//具体程序开始执行
    
// current query & result page number
    iQuery.SetQuery();
    iQuery.SetStart();

    
// begin to search
    
//开始具体搜索程序
    gettimeofday(&begin_tv,&tz);    //开始计时获取程序运行时间差

    iQuery.GetInvLists(mapBuckets);        
//将所有字符集存入映射变量中    瓶颈所在
    iQuery.GetDocIdx(vecDocIdx);        //将倒排索引存入向量中        瓶颈所在
    
    CHzSeg iHzSeg;        
//include ChSeg/HzSeg.h
    iQuery.m_sSegQuery = iHzSeg.SegmentSentenceMM(iDict, iQuery.m_sQuery);    //将get到的查询变量分词分成 "我/        爱/        你们/    的/        格式"
    
    vector
<string> vecTerm;
    iQuery.ParseQuery(vecTerm);        
//将以"/"划分开的关键字一一顺序放入一个向量容器中
    
    
set<string> setRelevantRst; 
    iQuery.GetRelevantRst(vecTerm, mapBuckets, setRelevantRst); 
    
    gettimeofday(
&end_tv,&tz);
    
// search end
    
//搜索完毕

    
//下面开始显示
    CDisplayRst iDisplayRst; 
    iDisplayRst.ShowTop(); 

    
float used_msec = (end_tv.tv_sec-begin_tv.tv_sec)*1000 
        
+((float)(end_tv.tv_usec-begin_tv.tv_usec))/(float)1000

    iDisplayRst.ShowMiddle(iQuery.m_sQuery,used_msec, 
            setRelevantRst.size(), iQuery.m_iStart);

    iDisplayRst.ShowBelow(vecTerm,setRelevantRst,vecDocIdx,iQuery.m_iStart); 

    
return 0;

}

 

你可能感兴趣的:(搜索引擎,HTTP服务器,百度,cgi,action,border)