C#爬虫获取月之门游戏音乐在线试听地址(C#源码)


免积分源码地址







      个人比较喜欢小旭和月之门出品的游戏音乐。发现月之门不知道什么时候换了个漂亮的网站,而且还提供了音乐在线试听的页面,如图所示。

C#爬虫获取月之门游戏音乐在线试听地址(C#源码)_第1张图片


地址是  http://www.cyberweaver.net/demo.asp

查看了一下网页源代码。发现所有的音乐地址都存在于百度网盘中。于是整了一个爬虫将地址都抓取出来了。




因为很简单,没什么需要多解释的,就直接上代码(C#)。


using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;

namespace GetUrl
{
    public partial class Form1 : Form
    {

        string strCode;
        ArrayList alLinks;

        /// <summary>
        /// Creates the form and runs <c>InitializeComponent</c>, which is not defined in
        /// this part of the partial class.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
        }


        /// <summary>
        /// Downloads the page whose address is typed in <c>textBox1</c>, extracts the
        /// category links from it and writes them out through <c>WriteToXml</c>.
        /// The downloaded source and the extracted links are kept in the
        /// <c>strCode</c> and <c>alLinks</c> fields.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            // Trim before validating so whitespace-only input is rejected as empty
            // instead of reaching the scheme check with a zero-length string.
            string strURL = textBox1.Text.Trim();
            if (strURL.Length == 0)
            {
                MessageBox.Show("请输入网址");
                return;
            }

            // StartsWith is safe for inputs shorter than the scheme prefix (the former
            // Substring(0, 7) threw ArgumentOutOfRangeException), and an address that
            // already carries https:// must not get a second scheme prepended.
            bool hasScheme = strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                strURL = "http://" + strURL;
            }

            MessageBox.Show("正在获取页面代码,请稍后...");
            strCode = GetPageSource(strURL);
            MessageBox.Show("正在提取超链接,请稍侯...");
            alLinks = GetHyperLinks(strCode);
            MessageBox.Show("正在写入文件,请稍侯...");
            WriteToXml(strURL, alLinks);
        }

        /// <summary>
        /// Downloads a page with an HTTP GET request and returns its body decoded as GB2312 text.
        /// </summary>
        /// <param name="URL">Absolute address of the page to download.</param>
        /// <returns>The complete response body as a string.</returns>
        /// <exception cref="UriFormatException"><paramref name="URL"/> is not a valid absolute URI.</exception>
        /// <exception cref="WebException">The request could not be completed.</exception>
        public static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);

            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

            // Configure the request BEFORE it is sent: values assigned after
            // GetResponse() can no longer influence the request on the wire.
            hwReq.Method = WebRequestMethods.Http.Get;
            hwReq.KeepAlive = false;

            // Dispose the response, its stream and the reader deterministically so the
            // connection is released even when reading fails half-way.
            using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
            using (Stream body = hwRes.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }
        /// <summary>
        /// Extracts the distinct relative category links of the form
        /// <c>demo_class.asp?sort=XXXX&amp;id=NNN</c> from HTML text, where <c>XXXX</c> is
        /// exactly four characters outside the Latin-1 range (for example Chinese characters).
        /// </summary>
        /// <param name="htmlCode">HTML text to scan; <c>null</c> or empty yields an empty list.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of the distinct matched link strings, sorted with
        /// <see cref="ArrayList.Sort()"/>.
        /// </returns>
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();
            if (string.IsNullOrEmpty(htmlCode))
            {
                return al;
            }

            // The class must be written [^\x00-\xff]: without the backslashes it is a plain
            // set of the letters/digits x, 0, f and the range '0'-'x', which also lets ASCII
            // punctuation through. The dot before "asp" is escaped so it only matches '.'.
            const string strRegex = @"demo_class\.asp\?sort=([^\x00-\xff]{4})&id=([a-z0-9]+)";

            // Filter duplicate links with a set instead of rescanning the list per match.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match match in Regex.Matches(htmlCode, strRegex, RegexOptions.IgnoreCase))
            {
                if (seen.Add(match.Value))
                {
                    al.Add(match.Value);
                }
            }

            al.Sort();

            return al;
        }
      
     // 提取HTML代码中的网址 
        /// <summary>
        /// Finds the first Baidu Pan share link in HTML text, e.g.
        /// <c>http://pan.baidu.com/share/link?shareid=112744&amp;uk=455690558</c>.
        /// </summary>
        /// <param name="htmlCode">HTML text to scan; may be <c>null</c> or empty.</param>
        /// <returns>
        /// The first matching link, or <c>null</c> when the text contains no such link
        /// (including when <paramref name="htmlCode"/> is <c>null</c> or empty).
        /// </returns>
        public static string GetHyperLinks_Song(string htmlCode)
        {
            if (string.IsNullOrEmpty(htmlCode))
            {
                return null;
            }

            // The dots in the host name are escaped so that they match a literal '.'
            // only; unescaped they would accept any character in those positions.
            const string strRegex = @"http://pan\.baidu\.com/share/link\?shareid=([a-z0-9]+)&uk=([a-z0-9]+)";

            // Only the first occurrence is wanted, so a single Match call replaces the
            // former loop that always returned on its first iteration.
            Match first = Regex.Match(htmlCode, strRegex, RegexOptions.IgnoreCase);
            return first.Success ? first.Value : null;
        }

        /// <summary>
        /// Writes the given links to the file <c>HyperLinks.xml</c> (relative path, UTF-8,
        /// indented). Each link becomes one element whose name is the value returned by
        /// <c>GetDomain</c> for that link and whose text is the link itself.
        /// </summary>
        /// <param name="strURL">Address the links were extracted from; only used in the XML comment.</param>
        /// <param name="alHyperLinks">Links to write; every item must be a string.</param>
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            // The using block closes the writer even when one of the writes throws, so
            // the file handle is not leaked on failure.
            using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
                writer.WriteComment("提取自" + strURL + "的超链接");

                // Two nested "HyperLinks" elements are written on purpose to keep the
                // produced document layout unchanged; the inner one carries the timestamp.
                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                foreach (string link in alHyperLinks)
                {
                    writer.WriteElementString(GetDomain(link), null, link);
                }

                writer.WriteEndElement();
                writer.WriteEndElement();

                writer.Flush();
            }
        }

        /// <summary>
        /// Appends one line to the text file <c>月之门游戏音乐地址.txt</c> (relative path),
        /// creating the file when it does not exist yet. File-access failures are reported
        /// in a message box instead of being thrown.
        /// </summary>
        /// <param name="strURL">Text to append as a single line.</param>
        static void WriteToTxt(string strURL)
        {
            string strAddrTxt = "月之门游戏音乐地址.txt";

            try
            {
                // Opening a StreamWriter in append mode creates a missing file by itself,
                // so the former File.Exists check and the separate FileStream pre-creation
                // step (which could race with other writers) are not needed. Both cases now
                // share the same error handling.
                using (StreamWriter sw = new StreamWriter(strAddrTxt, true, Encoding.Default))
                {
                    sw.WriteLine(strURL);
                }
            }
            catch (IOException e)
            {
                MessageBox.Show(e.Message);
            }
            catch (UnauthorizedAccessException e)
            {
                MessageBox.Show(e.Message);
            }
        }

        /// <summary>
        /// Returns the top-level-domain label of the first <c>.com/</c>, <c>.net/</c>,
        /// <c>.cn/</c>, <c>.org/</c> or <c>.gov/</c> occurrence in the address (matched
        /// case-insensitively, returned as it appears in the input), without the dot and slash.
        /// </summary>
        /// <param name="strURL">Address to inspect.</param>
        /// <returns>The matched label such as <c>net</c>, or <c>other</c> when nothing matches.</returns>
        static string GetDomain(string strURL)
        {
            Match suffix = Regex.Match(
                strURL,
                @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)",
                RegexOptions.IgnoreCase);

            // Strip every dot and the trailing slash from the matched text; an
            // unsuccessful match has an empty value and falls through to "other".
            string label = Regex.Replace(suffix.Value, @"\.|/$", "");

            return label == "" ? "other" : label;
        }

        /// <summary>
        /// Crawls the page whose address is typed in <c>textBox1</c>: downloads it, extracts
        /// the category links, downloads each category page from
        /// <c>http://www.cyberweaver.net/</c> and appends the first Baidu Pan share link found
        /// on each page to the address text file via <c>WriteToTxt</c>.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click_1(object sender, EventArgs e)
        {
            // Trim before validating so whitespace-only input is rejected as empty.
            string strURL = textBox1.Text.Trim();
            if (strURL.Length == 0)
            {
                MessageBox.Show("请输入网址");
                return;
            }

            // StartsWith cannot throw on short input (Substring(0, 7) did), and an
            // address that already starts with https:// keeps its own scheme.
            bool hasScheme = strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                strURL = "http://" + strURL;
            }

            MessageBox.Show("正在获取页面代码,请稍后...");
            string strContent = GetPageSource(strURL);

            // Collect the relative demo_class.asp links of the start page.
            ArrayList arry_Link2_DemoPage = GetHyperLinks(strContent);

            // Download every category page and look for a share link such as
            // http://pan.baidu.com/share/link?shareid=112744&uk=455690558 in it.
            foreach (object link in arry_Link2_DemoPage)
            {
                string strUrlDemo = "http://www.cyberweaver.net/" + link;
                string strDemoContent = GetPageSource(strUrlDemo);
                string strSongAddr = GetHyperLinks_Song(strDemoContent);

                // A page without a share link yields null; skip it instead of appending
                // a blank line to the text file.
                if (strSongAddr != null)
                {
                    WriteToTxt(strSongAddr);
                }
            }
        }




        /// <summary>
        /// Downloads the resource whose address is typed in <c>textBox1</c> and saves its raw
        /// bytes to the file chosen in a save dialog. The bytes are copied unchanged, so the
        /// address of an image can be entered as well.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            // Trim before validating so whitespace-only input is rejected as empty.
            string strURL = textBox1.Text.Trim();
            if (strURL.Length == 0)
            {
                MessageBox.Show("请输入网址");
                return;
            }

            // StartsWith cannot throw on short input (Substring(0, 7) did), and an
            // address that already starts with https:// keeps its own scheme.
            bool hasScheme = strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                strURL = "http://" + strURL;
            }

            using (SaveFileDialog sfg = new SaveFileDialog())
            {
                sfg.Filter = "网页文件(*.html)|*.html;*.htm";
                if (sfg.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                WebRequest request = WebRequest.Create(strURL);

                // The using blocks release the connection and the file handle even when
                // the download fails half-way.
                using (WebResponse response = request.GetResponse())
                using (Stream reader = response.GetResponseStream())
                // FileMode.Create truncates an existing file; OpenOrCreate left the old
                // tail in place whenever the new content was shorter than the old one.
                using (FileStream writer = new FileStream(sfg.FileName, FileMode.Create, FileAccess.Write))
                {
                    reader.CopyTo(writer);
                }
            }
        }




        // Analyze the hyperlinks in http://www.cyberweaver.net/demo.asp
        // NOTE: the handler body is empty, so invoking it currently does nothing.
        private void button3_Click(object sender, EventArgs e)
        {

        }







        /// <summary>
        /// Regular-expression test handler: loads the locally saved page
        /// <c>月之门在线试听网页.html</c> (GB2312), joins its lines without separators, removes
        /// all double quotes and runs the category-link extraction over the result.
        /// </summary>
        /// <param name="sender">Control that raised the event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button3_Click_1(object sender, EventArgs e)
        {
            // StringBuilder avoids re-copying the whole text for every appended line.
            StringBuilder content = new StringBuilder();
            try
            {
                using (StreamReader sr = new StreamReader("月之门在线试听网页.html", Encoding.GetEncoding("gb2312")))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        content.Append(line);
                    }
                }
            }
            catch (Exception e2)
            {
                // Best-effort diagnostic handler: report the problem and stop here.
                // Continuing without content used to crash in Regex.Replace(null, ...).
                Console.WriteLine(e2.Message);
                return;
            }

            // Links look like <a href="demo_class.asp?sort=北京&id=605"> in the page, so
            // the double quotes are removed before matching.
            string strContent = content.ToString().Replace("\"", "");

            // Reuse the production extraction (same pattern, de-duplication and sorting)
            // so that this test exercises exactly the expression the crawler uses.
            // The handler produces no other output; presumably the result is meant to be
            // inspected in a debugger.
            ArrayList al = GetHyperLinks(strContent);
        }

    }
}

最终得到的在线试听地址如下


C#爬虫获取月之门游戏音乐在线试听地址(C#源码)_第2张图片

 



现在的难点在于如何批量从百度网盘中下载文件。我的思路是将百度网盘的下载地址转换为迅雷地址,然后通过迅雷的批量任务下载。但是百度网盘地址转迅雷地址该怎么实现,我还没有办法,希望高手不吝赐教。



witch 2013-1-9


你可能感兴趣的:(C#爬虫获取月之门游戏音乐在线试听地址(C#源码))