C#提取网页中的超链接

最近在做一个WP7的音乐播放器,在网络搜索功能中需要用到提取网页超链接。

在网上找到了一个不错的例子,代码如下:

using System;

  using System.Xml;

  using System.Text;

  using System.Net;

  using System.IO;

  using System.Collections;

  using System.Text.RegularExpressions;

  /// <summary>
  /// Console tool that asks for a web page address, downloads the page, extracts the
  /// distinct http:// links found in its HTML and writes them to HyperLinks.xml.
  /// </summary>
  public class App
  {
      // Matches absolute http:// URLs inside raw HTML text (same pattern as before,
      // built once instead of on every call).
      private static readonly Regex LinkRegex = new Regex(
          @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

      // Captures a known top-level suffix at the end of the host part of a URL.
      // "[^/]*" cannot cross a slash, so a suffix that only appears in the path is ignored,
      // and "(?:/|$)" accepts URLs that stop right after the host (no trailing slash).
      private static readonly Regex DomainRegex = new Regex(
          @"^(?:\w+://)?[^/]*\.(com|net|cn|org|gov)(?:/|$)", RegexOptions.IgnoreCase);

      /// <summary>
      /// Program entry point: reads an address from the console, fetches the page,
      /// extracts its links and writes them to the XML file.
      /// </summary>
      public static void Main()
      {
          Console.Write("请输入一个网页地址:");

          // Console.ReadLine returns null at end of input; NormalizeUrl maps null and
          // blank input to null so we can stop instead of crashing.
          string strURL = NormalizeUrl(Console.ReadLine());
          if (strURL == null)
          {
              Console.WriteLine("未输入网页地址。");
              return;
          }

          Console.WriteLine("正在获取页面代码,请稍侯...");
          string strCode = GetPageSource(strURL);

          Console.WriteLine("正在提取超链接,请稍侯...");
          ArrayList alLinks = GetHyperLinks(strCode);

          Console.WriteLine("正在写入文件,请稍侯...");
          WriteToXml(strURL, alLinks);
      }

      /// <summary>
      /// Trims the user input and prepends "http://" when no http/https scheme is present.
      /// </summary>
      /// <param name="input">Raw text typed by the user; may be null.</param>
      /// <returns>The address with a scheme, or null when the input is null or blank.</returns>
      static string NormalizeUrl(string input)
      {
          if (input == null)
          {
              return null;
          }

          string url = input.Trim();
          if (url.Length == 0)
          {
              return null;
          }

          // StartsWith is safe for inputs shorter than the prefix (Substring(0,7) was not),
          // and the case-insensitive check keeps "HTTP://..." and "https://..." untouched.
          if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
              || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
          {
              return url;
          }

          return @"http://" + url;
      }

      /// <summary>
      /// Downloads the HTML of the given page and decodes it as GB2312.
      /// </summary>
      /// <param name="URL">Absolute address of the page to download.</param>
      /// <returns>The full response body as text.</returns>
      static string GetPageSource(string URL)
      {
          Uri uri = new Uri(URL);

          HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

          // The request must be configured before GetResponse() sends it.
          hwReq.Method = "GET";
          hwReq.KeepAlive = false;

          // Dispose the response and reader so the connection is released even on failure.
          using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
          using (StreamReader reader = new StreamReader(
              hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
          {
              return reader.ReadToEnd();
          }
      }

      /// <summary>
      /// Extracts the distinct http:// URLs contained in the given HTML.
      /// </summary>
      /// <param name="htmlCode">HTML text to scan.</param>
      /// <returns>An ArrayList of unique URL strings, sorted with the default comparer.</returns>
      static ArrayList GetHyperLinks(string htmlCode)
      {
          ArrayList al = new ArrayList();

          // Hashtable gives constant-time duplicate checks instead of rescanning the list
          // for every match; string keys compare by exact value, like the old "==" test.
          Hashtable seen = new Hashtable();

          foreach (Match match in LinkRegex.Matches(htmlCode))
          {
              string strNew = match.Value;
              if (!seen.ContainsKey(strNew))
              {
                  seen.Add(strNew, null);
                  al.Add(strNew);
              }
          }

          al.Sort();
          return al;
      }

      /// <summary>
      /// Writes the extracted links to "HyperLinks.xml" in the current directory.
      /// Each link becomes an element named after its domain suffix (see GetDomain).
      /// </summary>
      /// <param name="strURL">Address of the page the links were extracted from.</param>
      /// <param name="alHyperLinks">The URL strings to write.</param>
      static void WriteToXml(string strURL, ArrayList alHyperLinks)
      {
          // An XML comment must not contain "--"; break up such runs so WriteComment
          // does not reject a source URL that happens to include them.
          string commentUrl = strURL;
          while (commentUrl.IndexOf("--", StringComparison.Ordinal) >= 0)
          {
              commentUrl = commentUrl.Replace("--", "- -");
          }

          // "using" closes (and thereby flushes) the writer even if writing fails midway.
          using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
          {
              writer.Formatting = Formatting.Indented;
              writer.WriteStartDocument(false);
              writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
              writer.WriteComment("提取自" + commentUrl + "的超链接");

              // The document keeps its original layout: an outer HyperLinks element
              // wrapping an inner HyperLinks element that carries the timestamp.
              writer.WriteStartElement("HyperLinks");
              writer.WriteStartElement("HyperLinks", null);
              writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

              foreach (string str in alHyperLinks)
              {
                  string title = GetDomain(str);
                  writer.WriteElementString(title, null, str);
              }

              writer.WriteEndElement();
              writer.WriteEndElement();

              writer.Flush();
          }
      }

      /// <summary>
      /// Returns the domain suffix of a URL's host (com, net, cn, org or gov), keeping the
      /// casing found in the URL, or "other" when the host ends in none of them.
      /// </summary>
      /// <param name="strURL">The URL to classify.</param>
      /// <returns>The matched suffix without dot or slash, or "other".</returns>
      static string GetDomain(string strURL)
      {
          Match m = DomainRegex.Match(strURL);
          if (!m.Success)
          {
              return "other";
          }

          return m.Groups[1].Value;
      }
  }

  

你可能感兴趣的:(超链接)