没事抓点新闻看

# encoding: utf-8
require 'rubygems'
require 'cgi'
require 'uri'
require 'mechanize'

# Fetches a news.baidu.com search result page for a keyword and scrapes the
# headline entries out of the raw HTML with regular expressions.
class FetchNews
	# Captures the href of the headline link (stops at the first closing quote
	# so later attributes on the same line are not swallowed).
	URL_PATTERN = /><a href="(?<url>[^"]+)"/

	# Captures headline markup, source name and date. `.` does not cross
	# newlines here, so all three must sit on one line of the fragment.
	ENTRY_PATTERN = %r{>(?<title>.+)</a></h3>.+<p class="c-author">(?<source>.+)&nbsp;.+(?<date>\d{4}-\d{2}-\d{2})}

	# Any inline HTML tag (e.g. the <em> highlight around the keyword).
	TAG_PATTERN = /<[^>]*>/

	# Downloads the search result page for +keyword+.
	#
	# @param keyword [String] search term; it is form-encoded before being
	#   placed into the query string
	# @return [Object] whatever Mechanize#get returns for the search URL
	def self.get_page(keyword)
		agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
		agent.get(search_url(keyword))
	end

	# Extracts news entries from a fetched page.
	#
	# @param page [#body] object whose +body+ is the HTML of a result page
	# @return [Array<Hash>] one hash per entry with the keys :title, :url,
	#   :date and :source; fragments missing a URL, title or date are skipped
	def self.parse_html(page)
		# dup: do not mutate the caller's body (and nil.to_s may be frozen);
		# scrub: invalid bytes would otherwise make every regex match raise.
		html = page.body.to_s.dup.force_encoding("UTF-8").scrub
		html.split("result").map { |chunk| parse_entry(chunk) }.compact
	end

	# Fetches and parses the news for +keyword+ in one step.
	#
	# @param keyword [String] search term
	# @return [Array<Hash>] see parse_html
	def self.get_touch_news(keyword)
		page = get_page(keyword)
		parse_html(page)
	end

	# Builds the search URL with the keyword safely encoded.
	def self.search_url(keyword)
		kw = URI.encode_www_form_component(keyword.to_s)
		"http://news.baidu.com/ns?word=#{kw}&tn=news&from=news&cl=2&rn=20&ct=1"
	end

	# Turns one HTML fragment into an entry hash, or nil when it is not a
	# complete headline entry.
	def self.parse_entry(chunk)
		return nil unless chunk.include?('class="c-title"')

		text = chunk.strip
		link = URL_PATTERN.match(text)
		meta = ENTRY_PATTERN.match(text)
		return nil unless link && meta

		# Strip whole tags first, then decode entities such as &amp;.
		title = CGI.unescapeHTML(meta[:title].gsub(TAG_PATTERN, "")).strip
		return nil if title.empty?

		source = meta[:source].strip.gsub("&nbsp;", "")
		{:title => title, :url => link[:url].strip, :date => meta[:date], :source => source}
	end

	private_class_method :search_url, :parse_entry
end
# start = Time.now
# news = FetchNews.get_touch_news("rails")
# puts news
# puts "cost #{Time.now - start}"



 

欢迎关注微信订阅号，查看更多 Ruby & Rails 相关技术：


没事抓点新闻看
 

你可能感兴趣的:(新闻,news,mechanize)