export localroot=/path/to/jars-and-configs   # root directory holding the jars and config files to ship with the job
Start spark-shell (adjust the parameters to your needs):
spark-shell \
--name "content_baidunlp" \
--master yarn-client \
--num-executors 4 \
--executor-cores 2 \
--executor-memory 3G \
--driver-memory 5G \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=10000 \
--conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \
--conf spark.shuffle.memoryFraction=0.3 \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.default.parallelism=1000 \
--files $localroot/config/hanlp.properties \
--jars $localroot/lib/hanlp-1.6.3.jar,\
$localroot/lib/netty-3.10.6.Final.jar
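Since hanlp.properties is shipped via --files and the HanLP jar via --jars, a quick smoke test of the segmenter can be run once the shell is up. This is only a sketch: it assumes HanLP 1.6.x's com.hankcs.hanlp.HanLP.segment API and that the dictionary paths in hanlp.properties resolve where the code runs.
import com.hankcs.hanlp.HanLP
import scala.collection.JavaConverters._
// Segment a sample sentence to verify the jar and hanlp.properties were picked up.
val terms = HanLP.segment("商品和服务").asScala.map(_.word)
println(terms.mkString(" / "))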
Once inside the spark-shell you can run code interactively, for example:
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.json.{JSONObject,JSONArray}
val sc = SparkSession.builder().enableHiveSupport().getOrCreate()   // note: shadows spark-shell's built-in sc (the SparkContext)
val sqlContext = sc   // the SparkSession doubles as the SQL entry point
val conf=sc.sparkContext.getConf
val sparkContext=sc.sparkContext
import sqlContext.implicits._
val numbersRdd = sparkContext.parallelize[Int](1 to 200000).repartition(20)
val df=sparkContext.parallelize(Array("abcd","esdfa","dsfa")).toDF("str")
df.createOrReplaceTempView("tempTable")
sqlContext.sql("select substr(str,2,length(str) - 2) as newstr from tempTable").show
df.select(expr("substring(str,2,length(str)-2)").alias("newstr")).show
val df2=sparkContext.parallelize(Array("\"abcd\",\"dfasdf\",\"asdf\"","\"esdfa\",\"dsfa\",\"dfasadfa\",\"fdasdf\"","\"dasfa\",\"rewe\",\"erfgsdk\"")).toDF("str")
df2.select(explode(split(expr("substring(str,2,length(str)-2)"),",")).alias("str2")).select(regexp_replace(col("str2"),"\"","").alias("str3")).show
val df=sparkContext.parallelize(Array("[{\"a\":\"sdfa\",\"b\":\"asdf\"},{\"a\":\"bvcdf\",\"b\":\"eradf\"},{\"a\":\"hsf\",\"b\":\"reasdfad\"}]")).toDF("json")
// Parse the JSON array on the executors and join the "b" fields of each element.
val df2=df.map{
  case Row(json:String)=>
    val jo=new JSONArray(json)
    (0 until jo.length()).map(i=>jo.getJSONObject(i).getString("b")).mkString("&")
}.toDF("bvals")
// Same parsing on the driver, after collecting the rows locally.
df.collect.foreach{
  case Row(json:String)=>
    val jo=new JSONArray(json)
    println((0 until jo.length()).map(i=>jo.getJSONObject(i).getString("b")).mkString("&"))
}
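The same field can also be pulled out with Spark SQL's built-in get_json_object, avoiding the third-party JSON dependency on the executors. A sketch; the $[i].b path form is an assumption about the supported JsonPath subset:
// Extract the "b" field of the first three array elements; aliases are illustrative.
df.select(
  get_json_object(col("json"), "$[0].b").alias("b0"),
  get_json_object(col("json"), "$[1].b").alias("b1"),
  get_json_object(col("json"), "$[2].b").alias("b2")
).show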
// Rebuild a DataFrame from an RDD[Row] while reusing an existing schema
// (rdd, userDF and userDF0 are placeholders for your own data).
df.sqlContext.createDataFrame(rdd, df.schema)
userDF0.sqlContext.createDataFrame(userDF.rdd, userDF0.schema)
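A self-contained version of that pattern, with made-up column names: transform at the RDD[Row] level, then rebuild a DataFrame that keeps the original schema.
val srcDF = sparkContext.parallelize(Seq(("u1", 10), ("u2", 20))).toDF("user_id", "score")
// Work on raw Rows (here: double the score), then reattach the original schema.
val rowRDD = srcDF.rdd.map(r => Row(r.getString(0), r.getInt(1) * 2))
val rebuilt = srcDF.sqlContext.createDataFrame(rowRDD, srcDF.schema)
rebuilt.show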
// Broadcast a small lookup set and use it inside a UDF: keep whichever of c1/c2 is a known id, else mark the row for removal.
val bf=sc.sparkContext.broadcast(Set("1","2","3"))
val df= sparkContext.parallelize(Seq(("1","2"),("3","4"),("5","6"),("3","1"))).toDF("c1","c2")
val inUserIDs = udf { (x: String,y:String) =>
if(bf.value.contains(x)) x
else if(bf.value.contains(y)) y
else "deleteUser"
}
df.withColumn("user_id", inUserIDs(df("c1"),df("c2"))).filter($"user_id".notEqual("deleteUser"))
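When only one column has to match a small whitelist, the built-in isin is simpler than a broadcast UDF. A sketch; isin expands to a literal list, so it only suits small sets:
val whitelist = Seq("1","2","3")
// Keep rows whose c1 is in the whitelist, with no UDF involved.
df.filter(col("c1").isin(whitelist: _*)).show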
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}
val matVec = Seq(Vectors.dense(5,0,5), Vectors.dense(0,10,0), Vectors.dense(5,0,5))
val matRDD = new IndexedRowMatrix(sc.sparkContext.parallelize(matVec).zipWithIndex().map{case (vec, index) => IndexedRow(index, vec)})
matRDD.columnSimilarities()   // returns a CoordinateMatrix of pairwise column cosine similarities
matRDD.rows.collect.foreach{println}
matRDD.columnSimilarities().entries.collect.foreach{println}   // only the upper triangle (i < j) is materialized
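columnSimilarities also has an approximate, threshold-based variant (DIMSUM sampling) on RowMatrix, which is cheaper on wide matrices; a sketch of using it from the indexed matrix:
// Pairs whose similarity is likely below 0.1 may be dropped, trading accuracy for speed.
matRDD.toRowMatrix().columnSimilarities(0.1).entries.collect.foreach{println}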
A second, self-contained snippet: pairwise cosine similarity between sparse vectors (run in a fresh shell session, hence the repeated setup):
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.json.{JSONObject,JSONArray}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}
val sc = SparkSession.builder().enableHiveSupport().getOrCreate()
val sqlContext = sc
val conf=sc.sparkContext.getConf
val sparkContext=sc.sparkContext
import sqlContext.implicits._
// Sparse-friendly dot product: iterate only over the active (non-zero) entries of v1.
def innerProduct(v1:org.apache.spark.mllib.linalg.Vector,v2:org.apache.spark.mllib.linalg.Vector):Double={
  var output=0.0
  v1.foreachActive{ (index, value) =>
    output += value * v2(index)
  }
  output
}
// Despite the name, this returns cosine *similarity*, rounded to 4 decimals; zero vectors get 0.0.
def cosDistance(v1:org.apache.spark.mllib.linalg.Vector,v2:org.apache.spark.mllib.linalg.Vector):Double={
  val d1=innerProduct(v1,v1)
  val d2=innerProduct(v2,v2)
  val output=if(d1==0 || d2==0) 0.0 else innerProduct(v1,v2)/(math.sqrt(d1)*math.sqrt(d2))
  output.formatted("%.4f").toDouble
}
val v1=Vectors.sparse(4,Array(1,2,3),Array(0.1,0.2,0.3))
val v2=Vectors.sparse(4,Array(0,1),Array(0.5,1))
val v3=Vectors.sparse(4,Array(0,1,3),Array(0.5,1,1))
val rdd=sc.sparkContext.parallelize(Seq((0,v1),(1,v2),(2,v3)))
val map=Map((0,v1),(1,v2),(2,v3))
// For each vector, compute its similarity to all three reference vectors (id:score pairs joined with '#').
rdd.repartition(3).map{case (i,v)=> (i,(0 to 2).map(x=>x+":"+cosDistance(v,map(x))).mkString("#")) }.collect
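A quick by-hand check of the v2/v3 entry:
// v2 = (0.5, 1, 0, 0), v3 = (0.5, 1, 1, 0)
// dot(v2, v3) = 0.5*0.5 + 1*1    = 1.25
// |v2| = sqrt(0.25 + 1)          ≈ 1.1180
// |v3| = sqrt(0.25 + 1 + 1)      = 1.5
// 1.25 / (1.1180 * 1.5)          ≈ 0.7454
assert(cosDistance(v2, v3) == 0.7454)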