准备写个toy:
实现这么几个功能:
从 新浪,凤凰网,搜狐等抓取当天的新闻(目前暂定一天抓三次)。
对它们进行分词,然后看看各网站关注的话题有何不同。
-----mark 一下
抓取的网页多一点,然后每天对分出来的词进行归类,与hao123的热点新闻比较,看能不能得到点结果。
2013-4-7:
已经可以抓取好多网页的新闻标题了。
package Web;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches the front page of several news portals and extracts candidate
 * news headlines (the plain text of {@code <a>...</a>} anchors).
 *
 * NOTE(review): the fields below are static yet accessed through an
 * instance everywhere; they are kept static so any caller that reads them
 * as {@code CrawlWeb.strNewsTitle} keeps compiling. Making them instance
 * fields would be the better long-term fix.
 */
public class CrawlWeb {

    /** Headlines extracted from the most recently fetched page, one per line. */
    public static String strNewsTitle;
    /** Reserved for the word-segmented headlines (not populated yet). */
    public static String strNewsTitleSeg;
    /** Raw HTML of the most recently fetched page ("" after a failed fetch). */
    public static String rawTitle;
    /** Timestamp captured at construction, e.g. "2013-Apr-07-15". */
    public static String strTimeOfBorn;

    /** Anchor text: shortest run with no nested tag between ">" and "</a>". */
    private static final Pattern ANCHOR_TEXT = Pattern.compile(">([^<]+?)</[aA]>");

    /** Reproduces the original hand-rolled "year-MonthAbbrev-day-hour" format. */
    private static final DateTimeFormatter TIME_FMT =
            DateTimeFormatter.ofPattern("yyyy-MMM-dd-HH", Locale.US);

    public CrawlWeb() {
        getTime();
    }

    /** Front pages to crawl. */
    public static String[] webURLs = {"http://news.sohu.com/","http://news.ifeng.com/","http://news.qq.com/",
            "http://news.sina.com.cn/","http://www.xinhuanet.com/","http://news.baidu.com/",
            "http://news.163.com/","http://www.people.com.cn/","http://news.cntv.cn/",
            "http://www.chinanews.com/","http://www.zaobao.com/","http://www.huanqiu.com/",
            "http://www.gov.cn/","http://cn.yahoo.com/","http://www.stnn.cc/",
            "http://www.cankaoxiaoxi.com/","http://www.takungpao.com/",
            "http://www.china.com/","http://www.china.com.cn/"};

    /**
     * Downloads the page at {@code url} into {@link #rawTitle}.
     * Best-effort: on any error the text collected so far (possibly empty)
     * is kept and the exception is reported on stdout, matching the
     * original behaviour.
     *
     * @param url absolute HTTP URL of the page to fetch
     */
    void getWebPage(String url) {
        StringBuilder page = new StringBuilder();
        try {
            URL tric = new URL(url);
            HttpURLConnection con = (HttpURLConnection) tric.openConnection();
            con.setRequestProperty("User-Agent", "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10");
            // try-with-resources: the original leaked the connection streams
            // whenever an exception fired between open and close.
            // NOTE(review): charset is fixed to UTF-8 instead of the platform
            // default; some of these portals historically served GBK — if
            // headlines come out garbled, sniff the charset from the
            // Content-Type header instead. TODO confirm.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    con.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    page.append(line).append("\r\n");
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
        rawTitle = page.toString();
    }

    /**
     * Extracts every anchor text from {@link #rawTitle} into
     * {@link #strNewsTitle}, one candidate headline per CRLF-terminated line.
     * Safe to call before a fetch (treats a null page as empty).
     */
    void getNewsTitle() {
        StringBuilder titles = new StringBuilder();
        Matcher m = ANCHOR_TEXT.matcher(rawTitle == null ? "" : rawTitle);
        while (m.find()) {
            titles.append(m.group(1)).append("\r\n");
        }
        strNewsTitle = titles.toString();
    }

    /** Prints the extracted headlines to stdout. */
    void Show() {
        System.out.println(strNewsTitle);
    }

    /**
     * Appends {@code content} to {@code data/<savefile>.txt}, creating the
     * directory if needed. Append mode lets the three daily crawls
     * accumulate into one file per site and hour.
     *
     * @param content  text to append
     * @param savefile base file name (no directory, no extension)
     */
    void writeToFile(String content, String savefile) {
        // Portable path: the original hard-coded ".\\data\\", which is
        // Windows-only, and never created the directory, so the very first
        // run failed with FileNotFoundException.
        File file = new File("data", savefile + ".txt");
        File dir = file.getParentFile();
        if (dir != null && !dir.exists()) {
            dir.mkdirs();
        }
        System.out.println(file.getPath());
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
            bw.append(content);
        } catch (FileNotFoundException e) {
            System.out.println("file not found");
        } catch (IOException e) {
            System.out.println(e);
        }
    }

    /**
     * Stamps {@link #strTimeOfBorn} with the current local hour.
     * Replaces the brittle {@code Date.toString()} splitting, whose field
     * layout and month names depend on the default locale.
     */
    void getTime() {
        strTimeOfBorn = LocalDateTime.now().format(TIME_FMT);
    }
}
package Web;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point: crawls every configured portal, extracts the headlines and
 * appends them to {@code data/<host>.<timestamp>.txt}.
 */
public class main {

    /**
     * Pulls the host part out of a URL, e.g. "news.sina.com.cn" from
     * "http://news.sina.com.cn/". Compiled once instead of on every loop
     * iteration as in the original.
     */
    private static final Pattern HOST = Pattern.compile("//(.+?)/");

    public static void main(String[] args) {
        CrawlWeb cw = new CrawlWeb();
        System.out.println(cw.strTimeOfBorn);
        for (String url : CrawlWeb.webURLs) {
            cw.getWebPage(url);
            cw.getNewsTitle();
            // Save-file name format: news.sina.com.cn.2013-Apr-07-15
            // i.e. websitename.time
            String savefile = "";
            Matcher m = HOST.matcher(url);
            if (m.find()) {
                savefile = m.group(1) + "." + cw.strTimeOfBorn;
            }
            cw.writeToFile(cw.strNewsTitle, savefile);
            System.out.println("---");
        }
    }
}
抓取的数据示例:
------------------
可惜,杂质还是挺多的。
(杂质如何去除:想了半天,发现可以从字符长度入手。一般的新闻标题都不短,暂用长度阈值5过滤吧)