抓取 HTTP 网页中的全部链接

在 ASP.NET 中抓取网页的全部链接

效果图:

后台代码实现:

using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
using System.Net;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;

namespace getwebsite
{
    /// <summary>
    /// Web Forms page that downloads the page whose address is typed into
    /// <c>TextBox1</c> and lists every distinct <c>http://</c> link found in
    /// its source in <c>TextBox2</c>.
    /// </summary>
    public class WebForm1 : System.Web.UI.Page
    {
        protected System.Web.UI.WebControls.TextBox TextBox1;
        protected System.Web.UI.WebControls.Button Button1;
        protected System.Web.UI.WebControls.TextBox TextBox2;
        protected System.Web.UI.WebControls.RegularExpressionValidator RegularExpressionValidator1;

        // Matches absolute http:// URLs: one or more dot-separated host labels,
        // optionally followed by a path/query made of word characters and -./?%&=
        // The backslashes (\w, \.) are required: without them "[w-./?%&=]" contains
        // the reversed range 'w'-'.' and the Regex constructor throws.
        // Built once and shared, since the pattern never changes.
        private static readonly Regex LinkPattern = new Regex(
            @"http://([\w-]+\.)+[\w-]+(/[\w\-./?%&=]*)?",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Page load handler; currently performs no work.
        /// </summary>
        private void Page_Load(object sender, System.EventArgs e)
        {
            if (!this.IsPostBack)
            {
                // Nothing to initialize on the first (non-postback) request yet.
            }

            // Put user code to initialize the page here.
        }

        #region Web Form Designer generated code
        /// <summary>
        /// Wires up the designer-managed event handlers before the base
        /// initialization runs.
        /// </summary>
        override protected void OnInit(EventArgs e)
        {
            //
            // CODEGEN: This call is required by the ASP.NET Web Form Designer.
            //
            InitializeComponent();
            base.OnInit(e);
        }

        /// <summary>
        /// Required method for Designer support - do not modify
        /// the contents of this method with the code editor.
        /// </summary>
        private void InitializeComponent()
        {
            this.Button1.Click += new System.EventHandler(this.Button1_Click);
            this.Load += new System.EventHandler(this.Page_Load);
        }
        #endregion

        /// <summary>
        /// Downloads the page at the URL entered in <c>TextBox1</c> and writes
        /// each distinct link found in it to <c>TextBox2</c>, separated by spaces.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while creating the request or downloading the page
        /// (for example an invalid URL or a network failure) are not caught here
        /// and propagate to the caller; <c>TextBox2</c> is left empty in that case.
        /// </remarks>
        private void Button1_Click(object sender, System.EventArgs e)
        {
            // Clear the previous result first so a failed download does not
            // leave stale links on screen.
            this.TextBox2.Text = "";

            string pageSource = DownloadPage(this.TextBox1.Text);
            this.TextBox2.Text = ExtractDistinctLinks(pageSource);
        }

        /// <summary>
        /// Fetches <paramref name="url"/> over HTTP and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute address of the page to download.</param>
        /// <returns>The full response body.</returns>
        private static string DownloadPage(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

            // Dispose both the response and the reader even when reading throws,
            // so the underlying connection is always released.
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Collects every distinct link in <paramref name="html"/>, in order of
        /// first appearance.
        /// </summary>
        /// <param name="html">Page source to scan.</param>
        /// <returns>The links, each followed by a single space.</returns>
        private static string ExtractDistinctLinks(string html)
        {
            // Keys are the links already emitted (case-sensitive, like ==).
            Hashtable seen = new Hashtable();
            System.Text.StringBuilder links = new System.Text.StringBuilder();

            foreach (Match match in LinkPattern.Matches(html))
            {
                string link = match.Value;
                if (seen.ContainsKey(link))
                {
                    continue;
                }

                // Remember the link so later duplicates are filtered out.
                seen.Add(link, null);
                links.Append(link).Append(" ");
            }

            return links.ToString();
        }
    }
}

你可能感兴趣的:(抓取http网页的全部链接)