新闻采集源码可自写规则

 

using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Collections;
using System.IO;
using System.Net;
using System.Text;
namespace NewsCollection
{
    public partial class SiteEdit : System.Web.UI.Page
    {
        string urlData = "";
        protected void Page_Load(object sender, EventArgs e)
        {
            this.BtCollection.Attributes.Add("onclick", "Status.showInfo('加载中');");
            if (!Page.IsPostBack)
            {
                AjaxAction();
                BindData();
            }
        
        }
        public string GetRequest(string key)
        {
            key = Convert.ToString(Request[key]??"");
            key = key==null?(""):(key);
            return key;
        }
        public void AjaxAction()
        {
            string isAjax = GetRequest("isAjax").ToLower();
            if (isAjax == "true")
            {
                string state = "";
                string action = this.GetRequest("action").ToLower();
                string values = this.GetRequest("values");
                if (action == "newsbody")
                {
                    string modelstart = this.GetRequest("modelstart");
                    string modelend = this.GetRequest("modelend");
                    string modelbody = this.GetRequest("modelbody");
                    string siteUrl = this.GetRequest("siteUrl");
                    ArrayList al = this.GetModelData(modelstart, modelend, modelbody, siteUrl);
                    StringBuilder sb = new StringBuilder();
                    foreach (string s in al)
                    {
                        sb.Append(s);
                    }
                    state = sb.ToString();
                }
                else if (action == "newsdetail")
                {
                    string modelstart = this.GetRequest("modelstart");
                    string modelend = this.GetRequest("modelend");
                    string modelbody = this.GetRequest("modelbody");
                    string siteUrl = this.GetRequest("siteUrl");
                    string newsTitleStart = this.GetRequest("newsTitleStart");
                    string newsTitleEnd = this.GetRequest("newsTitleEnd");
                    string newsContentStart = this.GetRequest("newsContentStart");
                    string newsContentEnd = this.GetRequest("newsContentEnd");
                    state = GetNews(GetModelData(modelstart, modelend, modelbody, siteUrl), newsTitleStart, newsTitleEnd, newsContentStart, newsContentEnd);
                }
                else
                {
                    state = "test Ajax";
                }
                Response.Clear();
                Response.Write(state);
                Response.End();
            }
        }
        public void BindData()
        {
            string Gid = Convert.ToString(Request["Gid"] ?? "");
            if (Gid.Length > 0)
            {
                Beans.Sites sites = new Beans.Sites();
                sites.Gid = Gid;
                sites = sites.SelectById();
                TbSiteName.Text  = sites.SiteName;
                TbSiteUrl.Text =sites.SiteUrl;
                TbSiteModelStart.Text = sites.SiteModelStart;
                TbSiteModelEnd.Text =sites.SiteModelEnd;
                TbSiteModelBody.Text = sites.SiteModelBody;
                TbNewsTitleStart.Text =sites.NewsTitleStart;
                TbNewsTitleEnd.Text =sites.NewsTitleEnd;
                TbNewsContentStart.Text =sites.NewsContentStart;
                TbNewsContentEnd.Text =sites.NewsContentEnd;
            }
        }
        protected void BtEdit_Click(object sender, EventArgs e)
        {
            string message = "系统错误请重试";
            string script = "history.go(-1)";
            string Gid = Convert.ToString(Request["Gid"]??"");
            Beans.Sites sites = new Beans.Sites();
            sites.SiteName = TbSiteName.Text.Trim();
            sites.SiteUrl = TbSiteUrl.Text.Trim();
            sites.SiteModelStart = TbSiteModelStart.Text.Trim();
            sites.SiteModelEnd = TbSiteModelEnd.Text.Trim();
            sites.SiteModelBody = TbSiteModelBody.Text.Trim();
            sites.NewsTitleStart = TbNewsTitleStart.Text.Trim();
            sites.NewsTitleEnd = TbNewsTitleEnd.Text.Trim();
            sites.NewsContentStart = TbNewsContentStart.Text.Trim();
            sites.NewsContentEnd = TbNewsContentEnd.Text.Trim();
            if (Gid.Length > 0)
            {
                sites.Gid=Gid;
                if (sites.Update())
                {
                    message = "修改成功!";
                }
             
            }
            else
            {
                if (sites.Add())
                {
                    message = "添加成功!";
                }
              
            }
            Response.Write("<script type='text/javascript'>alert('"+message+"');"+script+"</script>");
         
        }

        protected void BtCollection_Click(object sender, EventArgs e)
        {
            GetNews(GetModelData(TbSiteModelStart.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelEnd.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelBody.Text, TbSiteUrl.Text), TbNewsTitleStart.Text.Trim(), TbNewsTitleEnd.Text.Trim(), TbNewsContentStart.Text.Trim(), TbNewsContentEnd.Text.Trim());
     
           // GetNews(GetModelData(TbSiteModelStart.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelEnd.Text.Replace("(", "\\(").Replace(")", "\\)").Replace(".", "\\."), TbSiteModelBody.Text, TbSiteUrl.Text));
        }
        public ArrayList GetModelData(string modelstart,string modelend,string modelbody,string SiteUrl)
        {

            ArrayList al = new ArrayList();
            string content = GetHttpData(SiteUrl,"gb2312");
            Regex reg = new Regex(modelstart+"(?<newsBody>[\\s\\S]*)"+modelend, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            Match mat = reg.Match(content);
            //TbContent.Text = mat.Groups["newsBody"].Value.ToString();
            Regex regurl = new Regex(modelbody.Replace("_url_", "(?<url>[^\"'\\s]+)"), RegexOptions.IgnoreCase | RegexOptions.Singleline);
            Match maturl = regurl.Match(mat.Groups["newsBody"].Value.ToString());
            while (maturl.Success)
            {
                //z-zA-A0-9/\\.:
                string temp = maturl.Groups["url"].Value;
                al.Add(temp);
                Response.Write(temp.StartsWith("http://") ? (temp) : (temp.Insert(0, "http://" + SiteUrl.Replace("http://","").Substring(0, SiteUrl.LastIndexOf("/")))) + "<br>");
                maturl = maturl.NextMatch();
            }
            return  al;
        }
    
        public string  GetNews(ArrayList al,string titleStart,string titleEnd,string contentStart,string contetnEnd)
        {
            StringBuilder sb = new StringBuilder();
            if (al != null)
            {
            
                foreach (string s in al)
                {

                    Regex reg = new Regex(titleStart.Replace("(", "\\(").Replace(")", "\\)") + "(?<title>[^<]*)" + titleEnd.Replace("(", "\\(").Replace(")", "\\)") + "[\\s\\S]+" + contentStart.Replace("(", "\\(").Replace(")", "\\)") + "(?<content>[\\s\\S]+)" + contetnEnd.Replace("(", "\\(").Replace(")", "\\)"), RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    Match mat = reg.Match(GetHttpData(s, "gb2312"));
                    //Response.Write(string.Format("news:{0}<br>content:{1}<br>",mat.Groups["title"].Value,mat.Groups["content"].Value));
                    Beans.News news = new Beans.News();
                    news.Title = mat.Groups["title"].Value;
                    news.Typeid = Convert.ToString(Request["Gid"]);//mat.Groups["title"].Value;
                    news.From = this.TbSiteName.Text;//mat.Groups["content"].Value;
                    news.Content = mat.Groups["content"].Value;
                    news.Add();
                    sb.AppendFormat("title:{0}", mat.Groups["title"].Value);
                }
            }
            return sb.ToString();
        }
        public string GetHttpData(string sUrl, string encoding)
        {
            string sRslt = null;
            WebResponse oWebRps = null;
            WebRequest oWebRqst = WebRequest.Create(sUrl);
            oWebRqst.Timeout = 50000;
            try
            {
                oWebRps = oWebRqst.GetResponse();
            }

            finally
            {
                if (oWebRps != null)
                {
                    StreamReader oStreamRd = new StreamReader(oWebRps.GetResponseStream(), System.Text.Encoding.GetEncoding(encoding));
                    sRslt = oStreamRd.ReadToEnd();
                    oStreamRd.Close();
                    oWebRps.Close();
                }
            }
            return sRslt;
        }

    

   
    }
}

 

你可能感兴趣的:(源码)