统计文本文件中单词出现的次数
import scala.io.Source
import scala.reflect.io.File
/**
 * Counts how many times each word occurs in a local text file and prints
 * one `(word, count)` pair per distinct word.
 *
 * @author 多易教育 - 行哥
 */
object WordCount {
  /**
   * Reads `d://word.txt`, splits every line into words and prints the number
   * of occurrences of each distinct word to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Open the input file on the local disk.
    val source = Source.fromFile("d://word.txt")
    try {
      // Split every line into tokens and flatten them into one stream of words.
      // Splitting on runs of whitespace and dropping empty tokens keeps blank lines
      // and repeated spaces from being counted as an empty "word".
      val words: Iterator[String] =
        source.getLines().flatMap(_.split("\\s+")).filter(_.nonEmpty)
      // Group identical words together; the size of each group is the word's count.
      // The iterator is fully consumed here, while the file is still open.
      val counts: Map[String, Int] = words.toList.groupBy(identity).map {
        case (word, occurrences) => (word, occurrences.size)
      }
      counts.foreach(println)
    } finally {
      // Release the file handle even when reading or counting fails.
      source.close()
    }
  }
}
// Three batches of sample (city, value) pairs.
val d1 = Array("beijing" -> 28.1, "shanghai" -> 28.7, "guangzhou" -> 32.0, "shenzhen" -> 33.1)
val d2 = Array("beijing" -> 27.3, "shanghai" -> 30.1, "guangzhou" -> 33.3)
val d3 = Array("beijing" -> 28.2, "shanghai" -> 29.1, "guangzhou" -> 32.0, "shenzhen" -> 32.1)
// Concatenate the batches into a single array
// (d1.union(d2).union(d3) is an alternative spelling of the same concatenation).
val data: Array[(String, Double)] = Array.concat(d1, d2, d3)
// Group the samples by city name.
val map: Map[String, Array[(String, Double)]] = data.groupBy(sample => sample._1)
// Approach 1: fold the values of each group starting from 0, then divide by the group size.
val res: Map[String, Double] = map.mapValues { samples =>
  val total = samples.foldLeft(0d)((acc, sample) => acc + sample._2)
  total / samples.length
}
// Approach 2: add up the values of each group pairwise, then divide by the group size.
map.mapValues { samples =>
  val total = samples.map(_._2).reduce(_ + _)
  total / samples.length
}.foreach(println)
// Approach 3: regroup from scratch and build explicit (city, average) pairs.
data.groupBy(_._1).map { case (city, samples) =>
  val values = samples.map(_._2)
  (city, values.sum / values.size)
}.foreach(println)
数据如下:每个字母代表一个人,冒号后面是这个人的好友列表;统计任意一个人和其他人的共同好友
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
/**
 * Reads `data/friends.txt`, where each line has the form `user:friend1,friend2,...`,
 * and prints the common friends of every pair of users that share at least one friend.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  // Open the input file; it is closed in the `finally` below.
  val bs = Source.fromFile("data/friends.txt")
  try {
    // Parse each line once into (user, friends). Lines without a ':' separator
    // (for example blank lines) are skipped instead of failing with an index error,
    // and empty friend tokens are dropped so that two users with empty friend lists
    // do not appear to share the friend "".
    val arr: Array[(String, Array[String])] = bs.getLines()
      .map(_.split(":", -1))
      .collect {
        case parts if parts.length >= 2 => (parts(0), parts(1).split(",").filter(_.nonEmpty))
      }
      .toArray
    // Visit every unordered pair (i < j) exactly once, intersect the two friend lists
    // and keep only the pairs that actually have friends in common.
    val res = for {
      i <- arr.indices
      j <- i + 1 until arr.length
      same = arr(i)._2.intersect(arr(j)._2)
      if same.nonEmpty
    } yield (s"${arr(i)._1}和${arr(j)._1}的好友有: ", same.toList)
    // Print one (label, common friends) pair per line.
    res.foreach(println)
  } finally {
    // Release the file handle even when parsing fails.
    bs.close()
  }
}
数据如下:
user.txt
uid , name , age , friend
u001,hls,22,fengjie
u002,wangwu,31,lisi
u003,zhangyanru,22,tananpengyou
u004,laocao,26,fengyi
u005,mengqi,12,nvmengqi
u006,haolei,38,sb
u007,wanghongjing,24,wife
u009,wanghongjing,24,wife
orders.txt
oid,uid,cost
order011,u001,300
order012,u002,200
order023,u006,100
order056,u007,300
order066,u003,500
order055,u004,300
order021,u005,300
order014,u001,100
order025,u005,300
order046,u007,30
order067,u003,340
order098,u008,310
/**
 * Joins every order in `data/join/orders.txt` with its user from `data/join/user.txt`
 * and prints one `(uid, name, age, friend, oid)` tuple per order, sorted by uid.
 * Orders whose uid has no matching user are printed with "null" in all four user fields.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  // Reads a whole file into memory and always closes the underlying handle.
  def readLines(path: String): List[String] = {
    val src = Source.fromFile(path)
    try src.getLines().toList finally src.close()
  }

  // Index the users by uid: uid -> (uid, name, age, friend).
  // Lines with fewer than four comma-separated fields are skipped instead of
  // failing with an index error. When a uid occurs twice, the later line wins.
  val map: Map[String, (String, String, String, String)] = readLines("data/join/user.txt")
    .map(_.split(",", -1))
    .collect { case f if f.length >= 4 => f(0) -> (f(0), f(1), f(2), f(3)) }
    .toMap

  // Keep (uid, oid) for every order line that has at least those two fields.
  // NOTE(review): any third column of an order line is not read, so it never
  // reaches the output — confirm this is intended.
  val orders: List[(String, String)] = readLines("data/join/orders.txt")
    .map(_.split(",", -1))
    .collect { case f if f.length >= 2 => (f(1), f(0)) }

  // Placeholder used for orders that reference an unknown uid.
  val missing = ("null", "null", "null", "null")

  // Walk over the orders and attach the matching user record to each one.
  val r = orders.map { case (uid, oid) =>
    val (id, name, age, friend) = map.getOrElse(uid, missing)
    (id, name, age, friend, oid)
  }

  // Print the joined rows ordered by the user's uid.
  r.sortBy(_._1).foreach(println)
}
数据
site1,user1,2018-03-01 02:12:22
site1,user2,2018-03-05 04:12:22
site1,user2,2018-03-05 04:13:22
site1,user2,2018-03-05 04:14:22
site1,user2,2018-03-05 04:15:22
site4,user7,
site1,user2,2018-03-05 05:15:22
site1,user2,2018-03-05 08:15:22
site1,user3,2018-03-05 04:15:22
site1,user4,2018-03-05 05:15:22
site1,user3,2018-03-07 11:12:22
site1,user3,2018-03-08 11:12:22
site2,user4,2018-03-07 15:12:22
site3,user5,2018-03-07 08:12:22
site3,user6,2018-03-05 08:12:22
site1,user1,2018-03-08 11:12:22
site1,,2018-03-08 11:12:22
site2,user2,2018-03-07 15:12:22
site3,user5,2018-03-07 08:12:22
site3,user5,2018-03-07 18:12:22
site3,user6,2018-03-05 08:12:22
site4,user7,2018-03-03 10:12:22
site2,,2018-03-08 11:12:22
site3,user5,2018-03-07 08:12:22
site3,user6,2018-03-05 08:12:22
site4,user5,2018-03-03 10:12:22
site4,user7,2018-02-20 11:12:22
/**
 * Reads `data/pvuv/pvuv.txt`, where each line has the form `page,user,yyyy-MM-dd HH:mm:ss`,
 * and prints one `(page, day, pv, uv)` tuple per page and day: `pv` is the number of
 * visits and `uv` the number of distinct users.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  val source = Source.fromFile("data/pvuv/pvuv.txt")
  try {
    // Split each line once, then keep only complete records: at least three fields,
    // none of them blank. The previous guard (`length >= 0`) was always true, so a
    // line with fewer than three fields crashed when its timestamp was read.
    val visits: List[(String, String, String)] = source.getLines()
      .map(_.split(",", -1))
      .filter(fields => fields.length >= 3 && fields.forall(_.trim.nonEmpty))
      // The day is the part of the timestamp before the first whitespace character.
      .map(fields => (fields(0), fields(1), fields(2).trim.split("\\s")(0)))
      .toList

    // Group the visits by (page, day) and derive both metrics for each group.
    val res = visits.groupBy { case (page, _, day) => (page, day) }.map {
      case ((page, day), hits) =>
        // Page views: every visit of this page on this day counts.
        val pv: Int = hits.size
        // Unique visitors: count each user only once.
        val uv: Int = hits.map(_._2).distinct.size
        (page, day, pv, uv)
    }
    res.foreach(println)
  } finally {
    // Release the file handle even when parsing fails.
    source.close()
  }
}
数据如下:每行是一条线段的起点和终点,统计线段在每个点重叠的次数,并按照次数从高到低排序输出
1,4
2,5
4,6
2,4
3,6
4,6
1,5
/**
 * Reads `data/line.txt`, where each line has the form `start,end` describing a segment
 * on the integer axis, counts how many segments cover each integer point and prints
 * the `(point, count)` pairs ordered by count, highest first.
 *
 * @param args command-line arguments (unused)
 */
def main(args: Array[String]): Unit = {
  // Open the input file; it is closed in the `finally` below.
  val bs = Source.fromFile("data/line.txt")
  try {
    // Parse each non-blank line into an (start, end) pair of integers. Blank lines are
    // skipped and surrounding spaces are trimmed so they do not break the conversion.
    val segments: List[(Int, Int)] = bs.getLines()
      .filter(_.trim.nonEmpty)
      .map(_.split(","))
      .map(arr => (arr(0).trim.toInt, arr(1).trim.toInt))
      .toList
    // Expand every segment into all the points it passes through (both ends inclusive)
    // and flatten the result into one list of points.
    val points: List[Int] = segments.flatMap { case (start, end) => start to end }
    // Group identical points; the size of each group is the overlap count at that point.
    val mp: Map[Int, Int] = points.groupBy(identity).map {
      case (point, hits) => (point, hits.size)
    }
    // Sort by count in descending order.
    val sorted: List[(Int, Int)] = mp.toList.sortBy { case (_, count) => -count }
    // Print the result.
    sorted.foreach(println)
  } finally {
    // Release the file handle even when parsing fails.
    bs.close()
  }
}