Update (12/13): recall is unchanged; ranking currently uses CTR prediction plus rule-based sorting, with learning-to-rank (LTR) to follow.
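The ranking code itself isn't shown in this post, but the idea in the note above can be sketched: score each recalled candidate with the CTR model, nudge the score with business rules, then sort. A minimal sketch, assuming a hypothetical CtrModel interface and a single made-up boost rule (neither comes from the actual system):

import java.util.*;

// Illustrative sketch only: CtrModel and the "new arrivals" boost below are hypothetical
// stand-ins, not the rules used in the actual system.
interface CtrModel {
    double predict(long userId, long goodsId);  // predicted click-through rate for (user, goods)
}

class RuleBasedRanker {
    private final CtrModel ctrModel;
    private final Set<Long> newArrivals;  // example rule input: recently listed goods

    RuleBasedRanker(CtrModel ctrModel, Set<Long> newArrivals) {
        this.ctrModel = ctrModel;
        this.newArrivals = newArrivals;
    }

    // Score recalled candidates with the CTR model, apply rule adjustments, sort descending.
    List<Long> rank(long userId, List<Long> candidates) {
        final Map<Long, Double> score = new HashMap<Long, Double>();
        for (Long goodsId : candidates) {
            double s = ctrModel.predict(userId, goodsId);
            if (newArrivals.contains(goodsId)) {
                s *= 1.2;  // example rule: give new arrivals a 20% boost
            }
            score.put(goodsId, s);
        }
        List<Long> ranked = new ArrayList<Long>(candidates);
        Collections.sort(ranked, new Comparator<Long>() {
            @Override
            public int compare(Long a, Long b) {
                return Double.compare(score.get(b), score.get(a));  // higher score first
            }
        });
        return ranked;
    }
}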
Enough talk, here is the code:
import com.alibaba.fastjson.JSON;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.*;

public class Main {
    static final String ZK_QUORUM = "*.*.*.*:2181,*.*.*.*:2181,*.*.*.*:2181/kafka";
    static final String GROUP = "test-consumer-group";
    static final String TOPICSS = "user_trace";
    static final String NUM_THREAD = "64";

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("main.java.computingCenter");
        // Create the context with a 2-second batch size, i.e. pull from Kafka every two seconds
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, new Duration(2000));
        int numThreads = Integer.parseInt(NUM_THREAD);
        Map<String, Integer> topicMap = new HashMap<String, Integer>();
        String[] topics = TOPICSS.split(",");
        for (String topic : topics) {
            topicMap.put(topic, numThreads);
        }
        JavaPairReceiverInputDStream<String, String> messages =
                KafkaUtils.createStream(jssc, ZK_QUORUM, GROUP, topicMap);
        // Keep only the message value; the Kafka key is not used
        JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            public Iterable<String> call(String lines) {
                // Kafka message format:
                // "{\"Topic\":\"user_trace\",\"PartitionKey\":\"0\",\"TimeStamp\":1471524044018,\"Data\":\"0=163670589171371918%3A196846178238302087\",\"LogId\":\"0\",\"ContentType\":\"application/x-www-form-urlencoded\"}"
                // The Data field is URL-encoded; decoding it and splitting on "=" yields a "userId:goodsId" pair
                List<String> arr = new ArrayList<String>();
                for (String s : lines.split(" ")) {
                    Map<String, Object> j = JSON.parseObject(s);
                    String s1 = "";
                    String s2 = "";
                    try {
                        s1 = URLDecoder.decode(j.get("Data").toString(), "UTF-8");
                        s2 = s1.split("=")[1];
                    } catch (UnsupportedEncodingException e) {
                        e.printStackTrace();
                    }
                    arr.add(s2);
                }
                return arr;
            }
        });
        JavaPairDStream<String, String> goodsSimilarityLists = words.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                // Drop malformed records: keep only well-formed "userId:goodsId" pairs
                return s.split(":").length == 2;
            }
        }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, String>() {
            // Process each pair here, one partition at a time
            @Override
            public Iterable<Tuple2<String, String>> call(Iterator<String> s) throws Exception {
                ArrayList<Tuple2<String, String>> result = new ArrayList<Tuple2<String, String>>();
                while (s.hasNext()) {
                    String x = s.next();
                    String userId = x.split(":")[0];
                    String goodsId = x.split(":")[1];
                    System.out.println(x);
                    LinkedHashMap<Long, Double> recommendMap = null;
                    try {
                        // This service reads from Redis, computes the real-time interest scores,
                        // and writes the recommendation results back to Redis for the API layer
                        CalculateInterestService calculateInterestService = new CalculateInterestService();
                        try {
                            recommendMap = calculateInterestService.calculateInterest(userId, goodsId);
                        } catch (Exception e) {
                            e.printStackTrace();
                        }
                        // Join the recommended goods ids with the "{/c}" delimiter and prepend a timestamp
                        String text = "";
                        int count = 0;
                        for (Map.Entry<Long, Double> entry : recommendMap.entrySet()) {
                            text = text + entry.getKey();
                            if (count == recommendMap.size() - 1) {
                                break;
                            }
                            count = count + 1;
                            text = text + "{/c}";
                        }
                        text = System.currentTimeMillis() + ":" + text;
                        result.add(new Tuple2<String, String>(userId, text));
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
                return result;
            }
        });
        goodsSimilarityLists.foreachRDD(new Function<JavaPairRDD<String, String>, Void>() {
            @Override
            public Void call(JavaPairRDD<String, String> rdd) throws Exception {
                // Print the RDD contents for easier debugging
                System.out.println(rdd.collect());
                return null;
            }
        });
        JavaPairDStream<Text, Text> goodsSimilarityListsText = goodsSimilarityLists.mapToPair(new PairFunction<Tuple2<String, String>, Text, Text>() {
            @Override
            public Tuple2<Text, Text> call(Tuple2<String, String> ori) throws Exception {
                // Convert each tuple to org.apache.hadoop.io.Text so saveAsHadoopFiles can write it to HDFS
                return new Tuple2<Text, Text>(new Text(ori._1), new Text(ori._2));
            }
        });
        // Write to HDFS; "/user/hadoop/recommend_list/rl" is the prefix and "123" the suffix of each batch's output path
        goodsSimilarityListsText.saveAsHadoopFiles("/user/hadoop/recommend_list/rl", "123", Text.class, Text.class, SequenceFileOutputFormat.class);
        jssc.start();
        jssc.awaitTermination();
    }
}
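Before the service code, it helps to spell out what calculateInterest computes. For each goods id g in the similarity list of the goods the user just acted on, it fetches g's own neighbours from the Redis similarity dictionary, maps the user's recorded action on each neighbour to a weight (click 0.2, add_cart 0.6, order 0.8, 0.1 when there is no record), and takes the similarity-weighted average:

    interest(g) = sum_i weight(action on neighbour_i) * sim(neighbour_i, g) / sum_i sim(neighbour_i, g)

The candidates are then sorted by this score to form the recommendation list that the streaming job writes out for the API layer.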
public class CalculateInterestService {
    private String dictKey = "greate_item_sim_2.0";
    private String recommendTable = "great_recommend_table_2.0";
    static final String HIGO_BASE_URL = "jdbc:mysql://*.*.*.*:3212/*";
    static final String HIGO_BASE_USER = "*";
    static final String HIGO_BASE_PASS = "*";

    public LinkedHashMap<Long, Double> calculateInterest(String userId, String traceGoodsId) {
        LinkedHashMap<Long, Double> sortedMap = new LinkedHashMap<Long, Double>();
        String[] simGoods = RedisHelper.getInstance().hget(dictKey, traceGoodsId).split(",");
        // The user's history should be stored as action:goodsId:timestamp records; this needs
        // refactoring, with BI writing them into a separate table
        HashMap<Long, String> userTrace = null;
        try {
            userTrace = getUserTrace(userId);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            return sortedMap;
        }
        HashMap<Long, Double> recommendMap = new HashMap<Long, Double>();
        String[] simGoodsIds = new String[simGoods.length];
        for (int i = 0; i < simGoods.length; i++) {
            simGoodsIds[i] = simGoods[i].split(":")[0];
        }
        List<String> pSimGoodsIds = RedisHelper.getInstance().hmget(dictKey, simGoodsIds);
        HashMap<Long, String> predictSimGoodsIds = new HashMap<Long, String>();
        for (int i = 0; i < simGoodsIds.length; i++) {
            predictSimGoodsIds.put(Long.parseLong(simGoodsIds[i]), pSimGoodsIds.get(i));
        }
        for (String item : simGoods) {
            // needs optimisation
            Double totalSum = 0.0;
            Double sum = 0.0;
            Long originGoodsId = Long.parseLong(item.split(":")[0]);
            for (String predictGoods : predictSimGoodsIds.get(originGoodsId).split(",")) {
                Long goodsId = Long.parseLong(predictGoods.split(":")[0]);
                Double sim = Double.valueOf(predictGoods.split(":")[1]);
                totalSum = totalSum + sim;
                Double score = 0.0;
                if (!userTrace.containsKey(goodsId)) {
                    // TODO: the user rating matrix is too sparse and needs SVD to fill in the missing
                    // ratings; unrated items fall back to the default score of 0.1 for now
                    userTrace.put(goodsId, "default");
                }
                String action = userTrace.get(goodsId);
                if (action.equals("click")) {
                    score = 0.2;
                } else if (action.equals("favorate")) {
                    // no weight is assigned for "favorate" here; score stays 0.0
                } else if (action.equals("add_cart")) {
                    score = 0.6;
                } else if (action.equals("order")) {
                    score = 0.8;
                } else if (action.equals("default")) {
                    score = 0.1;
                }
                // The similarity dictionary should store goodsId:sim entries; this needs refactoring
                sum = sum + score * sim;
            }
            Double predictResult = sum / totalSum;
            recommendMap.put(originGoodsId, predictResult);
        }
        // sort the recommend list
        List