一、爬取京东评论
京东评论竟然全部对外开放
/**
 * Crawls the comment pages of a JD.com product and persists each comment
 * through {@code CommentService}.
 */
public class CommentCrawler {
    /** Connection manager shared by every client returned from {@link #getClient()}. */
    static final PoolingHttpClientConnectionManager httpClientConnectionManager = new PoolingHttpClientConnectionManager();

    /** Hard upper bound on the number of comment pages fetched for one product. */
    static final int MAX_PAGE = 50;

    /**
     * Builds an HTTP client backed by the shared connection manager.
     *
     * @return a new client instance
     */
    static HttpClient getClient() {
        return HttpClients.custom().setConnectionManager(httpClientConnectionManager).build();
    }

    /**
     * Builds the comment-listing URL for one page of one product.
     *
     * @param productId product identifier placed in the {@code productId} query parameter
     * @param page      zero-based page index
     * @return the request URL (page size is fixed at 10)
     */
    static String getUrl(String productId, int page) {
        return String.format(
                "http://sclub.jd.com/comment/productPageComments.action?productId=%s&score=0&sortType=3&page=%d&pageSize=10",
                productId, page);
    }

    /**
     * Converts one element of the {@code comments} array into a {@code Comment}.
     *
     * @param json      a single comment object; its {@code id}, {@code score} and {@code content} fields are read
     * @param productId the product the comment belongs to
     * @return the populated comment
     */
    static Comment commentFromJson(JSONObject json, String productId) {
        return new Comment(json.getLongValue("id"), productId, json.getString("score"), json.getString("content"));
    }

    /**
     * Fetches the comment pages of a product, stores every comment, stores the
     * product itself while handling the first page, and finally runs
     * {@code ProductJudger.judge}.
     *
     * <p>The number of pages is taken from the {@code maxPage} field of the first
     * response and is capped at {@link #MAX_PAGE}.
     *
     * @param productId product identifier to crawl
     * @return {@code true} when every page was processed and the product was judged;
     *         {@code false} when a page carried no comments or any exception occurred
     */
    public static boolean crawlComments(String productId) {
        try {
            int maxPage = 1;
            HttpClient client = getClient();
            for (int nowPage = 0; nowPage < maxPage; nowPage++) {
                HttpGet get = new HttpGet(getUrl(productId, nowPage));
                HttpResponse resp = client.execute(get);
                JSONObject json = JSON.parseObject(EntityUtils.toString(resp.getEntity()));
                JSONArray comments = json == null ? null : json.getJSONArray("comments");
                // A missing array is treated exactly like an empty one instead of
                // relying on a NullPointerException reaching the catch block.
                if (comments == null || comments.size() == 0) {
                    return false;
                }
                CommentService ser = new CommentService();
                for (int i = 0; i < comments.size(); i++) {
                    ser.insertComment(commentFromJson(comments.getJSONObject(i), productId));
                }
                if (nowPage == 0) {
                    // Only the first response decides how many pages to walk; never
                    // trust it beyond MAX_PAGE, and tolerate the field being absent.
                    Integer reportedPages = json.getInteger("maxPage");
                    maxPage = reportedPages == null ? 1 : Math.min(reportedPages, MAX_PAGE);
                    ser.insertProduct(new Product(productId, comments.getJSONObject(0).getString("referenceName")));
                }
            }
            ProductJudger.judge(productId);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return false;
    }
}
二、结巴分词
jieba 分词原本是 Python 实现的，后来有人把它移植到了 Java，项目名称也改成了 jieba-analysis。
// Segment the text and count how many times each token that is not a stop word occurs.
JiebaSegmenter segmenter = new JiebaSegmenter();
// The element type must be spelled out: iterating a raw List as Word does not compile.
List<Word> list = segmenter.sentenceProcess(str);
for (Word word : list) {
    String token = word.getToken();
    if (stopWords.contains(token)) {
        continue;
    }
    Integer cnt = map.get(token);
    // First occurrence starts at 1; autoboxing replaces the deprecated Integer constructor.
    map.put(token, cnt == null ? 1 : cnt + 1);
}
三、词云图
这里用到 d3.js 和 d3-cloud.js 这两个库。d3 是 “Data-Driven Documents” 的缩写；d3-cloud 这个库比较难用，主要是因为官方示例代码太少了。
这里给出一个例子:每一个词云图都对应一个字典,这个字典就是“词语:频率”这样的键值对。给定多个字典,每一个字典都要渲染成一个词云图。
还需要编写如下JS代码
// Colour scale with 20 colours; indexed by word position when rendering.
var fill = d3.scale.category20();

// Width and height given to the rendered <svg> element.
var wordCloudWidth = 800;
var wordCloudHeight = 400;

// Font applied both to the layout and to the rendered text.
var font_name = "楷体";
var font_weight = "bold";
// NOTE(review): not referenced by the functions visible in this file — confirm before removing.
var max_font_size = 50;

// Number of words shown in one cloud.
var word_count = 50;
// Largest font size assigned to a displayed word.
var word_max_size = 60;
// Smallest font size assigned to a displayed word.
var word_min_size = 10;
/*
 * transformWordFraquency (the identifier keeps its original spelling so that
 * existing callers continue to work).
 *
 * Converts a word -> count dictionary into the {text, size} array consumed by
 * the word-cloud layout.
 *
 * words:   plain object shaped like {word1: cnt1, word2: cnt2, word3: cnt3}
 * returns: array of {text, size}, most frequent word first, holding at most
 *          word_count entries. "size" is no longer the raw count: it is a font
 *          size stepping down evenly from word_max_size for the first entry
 *          towards word_min_size.
 */
function transformWordFraquency(words) {
	var entries = [];
	for ( var word in words) {
		// Skip enumerable properties inherited through the prototype chain so
		// that only the dictionary's own keys become words.
		if (!Object.prototype.hasOwnProperty.call(words, word)) {
			continue;
		}
		entries.push({
			"text" : word,
			"size" : words[word]
		});
	}
	// Highest count first, then keep only the most frequent words.
	entries.sort(function(x, y) {
		return y['size'] - x['size'];
	});
	entries = entries.slice(0, Math.min(word_count, entries.length));
	// Replace each count with a font size derived from the word's rank.
	var step = (word_max_size - word_min_size) / entries.length;
	for (var rank = 0; rank < entries.length; rank++) {
		entries[rank]['size'] = word_max_size - step * rank;
	}
	return entries;
}
/*
 * Runs the d3-cloud layout for one word list and renders the result once the
 * layout emits its "end" event.
 *
 * wordMap:  array shaped like [{text: "", size: ""}]
 * selector: target location the cloud will be rendered into
 */
function createWordCloud(wordMap, selector) {
	var layoutSize = [ wordCloudWidth * 2 - 100, wordCloudHeight * 2 - 100 ];

	// Font size of a word is the "size" value it already carries.
	function sizeOf(d) {
		return d.size;
	}

	// Every word is laid out unrotated.
	function noRotation() {
		return 0;
	}

	// Hand the positioned words over to the renderer.
	function onLayoutEnd(words) {
		renderWordCloud(words, selector)
	}

	d3.layout.cloud()
		.size(layoutSize)
		.words(wordMap)
		.font(font_name)
		.fontWeight(font_weight)
		.fontSize(sizeOf)
		.rotate(noRotation)
		.on("end", onLayoutEnd)
		.start();
}
/*
 * Appends an <svg> to the element matched by selector and draws one <text>
 * element per laid-out word.
 *
 * With this chained style of calls, the order of the calls must not be changed.
 */
function renderWordCloud(words, selector) {
	// Moves the drawing origin to the middle of the svg.
	var centre = "translate(" + wordCloudWidth / 2 + "," + wordCloudHeight
			/ 2 + ")";

	var svg = d3.select(selector).append("svg")
		.attr("width", wordCloudWidth)
		.attr("height", wordCloudHeight);

	var group = svg.append("g").attr("transform", centre);

	// enter() yields one placeholder per word, comparable to a for loop over words.
	var labels = group.selectAll("text").data(words).enter().append("text");

	// Font size in pixels.
	function sizeInPx(d) {
		return d.size + "px";
	}

	// Font colour, picked by the word's index.
	function colourByIndex(d, i) {
		return fill(i);
	}

	// Position and rotation computed by the layout.
	function placement(d) {
		return "translate(" + [ d.x, d.y ] + ") rotate(" + d.rotate + ")";
	}

	function label(d) {
		return d.text;
	}

	labels
		.style("font-family", font_name)
		.style("font-weight", font_weight)
		// NOTE(review): text-anchor is left at its default; the setting below was
		// already disabled in the original — confirm that this is deliberate.
		// .attr("text-anchor", "middle")
		.style("font-size", sizeInPx)
		.style("fill", colourByIndex)
		.attr("transform", placement)
		.text(label);
}