Error hit during these runs (YARN killed the executor container for exceeding its physical memory allocation):
    3.1GB of 3GB physical memory used; Killing container
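This message means the executor's total footprint (JVM heap plus off-heap buffers, which Arrow-backed pandas UDFs use heavily) exceeded the container's allocation. A minimal, hedged sketch of one common mitigation is to raise the executor memory overhead when building the session; the values below are illustrative assumptions, not settings from the original run:

from pyspark.sql import SparkSession

# illustrative values (assumption, not from the original notes); Arrow/pandas
# UDF buffers live off-heap, so spark.executor.memoryOverhead is usually the
# knob that matters for this particular kill message
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "3g")
    .config("spark.executor.memoryOverhead", "1g")
    .getOrCreate()
)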
import time

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType, StructField, TimestampType, StringType, DoubleType
)

if __name__ == "__main__":
    # obtain (or reuse) the active Spark session
    spark = SparkSession.builder.getOrCreate()

    # ts: power
    quotes = pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2016-05-25 13:30:00.023"),
                pd.Timestamp("2016-05-25 13:30:00.023"),
                pd.Timestamp("2016-05-25 13:30:00.030"),
                pd.Timestamp("2016-05-25 13:30:00.041"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
                pd.Timestamp("2016-05-25 13:30:00.049"),
                pd.Timestamp("2016-05-25 13:30:00.072"),
                pd.Timestamp("2016-05-25 13:30:00.075")
            ],
            "id": [
                "GOOG",
                "MSFT",
                "MSFT",
                "MSFT",
                "GOOG",
                "AAPL",
                "GOOG",
                "MSFT"
            ],
            "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
            "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
        }
    )
    trades = pd.DataFrame(
        {
            "time": [
                pd.Timestamp("2016-05-25 13:30:00.023"),
                pd.Timestamp("2016-05-25 13:30:00.038"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
                pd.Timestamp("2016-05-25 13:30:00.048"),
                pd.Timestamp("2016-05-25 13:30:00.048")
            ],
            "id": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
            "price": [51.95, 51.95, 720.77, 720.92, 98.0],
            "quantity": [75, 155, 100, 100, 100]
        }
    )

    df_trades = spark.createDataFrame(trades)
    print(df_trades.schema["time"].dataType)  # TimestampType
    df_quotes = spark.createDataFrame(quotes)

    s_time = time.time()
    # df_quotes = df_quotes.withColumn("time", to_timestamp(pyspark.sql.functions.col("time")))
    # print(df_quotes.schema["time"].dataType)

    # observed runtime: ~14s; ~18s when a trade_count column was also computed
    def asof_join(l, r):
        # per-key pandas as-of join: each trade (left) is matched to the most
        # recent quote (right) at or before its timestamp
        return pd.merge_asof(l, r, on="time", by="id")

    # pandas hands back datetime64 columns; declaring "time" as DateType in the
    # output schema would silently drop the time-of-day information, e.g.:
    # "time DateType, id string, bid double, ask double, price double, quantity double"
    df_trades.groupby("id").cogroup(df_quotes.groupby("id")).applyInPandas(
        asof_join, schema=StructType([
            StructField("time", TimestampType(), True),
            StructField("id", StringType(), True),
            StructField("price", DoubleType(), True),
            StructField("quantity", DoubleType(), True),
            StructField("bid", DoubleType(), True),
            StructField("ask", DoubleType(), True)
        ])
    ).show()
    print("4--- %s seconds ---" % (time.time() - s_time))
3. Comparing the two approaches (context: when querying cpu_util we were worried that parallel queries would overwhelm InfluxDB, so a for loop was used instead).
Approach 1:
1. Query power.
2. In a for loop, query cpu_util and join it with power inside the loop, producing df_power_cpu.
Approach 2:
1. Query power.
2. In a for loop, query cpu_util (do as little work as possible inside the loop).
3. Do a single large DataFrame join at the end to get df_power_cpu.
Moving from approach 1 to approach 2 cut the runtime from 20+ minutes to about 6 minutes, a clear improvement; see the sketch below.
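A minimal sketch of approach 2, assuming hypothetical helpers query_power() and query_cpu_util(host_id) that read from InfluxDB and return Spark DataFrames sharing a (time, id) key; all names here are illustrative, not from the original notes:

from functools import reduce

def build_power_cpu(host_ids):
    # 1. query power once
    df_power = query_power()

    # 2. keep the loop body minimal: just collect the per-host frames
    cpu_frames = [query_cpu_util(h) for h in host_ids]

    # 3. one union and one big join, instead of a join per iteration
    df_cpu = reduce(lambda a, b: a.unionByName(b), cpu_frames)
    return df_power.join(df_cpu, on=["time", "id"], how="inner")

Note that unionByName requires the per-host frames to share the same column names, which is why the loop body should do nothing beyond the query itself.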
4. Converting millisecond Unix timestamps (bigint) to TimestampType. from_unixtime expects seconds, hence the division by 1000 below.
Example: 1464154230000 -> 2016-05-25 13:30:00 (rendered in a UTC+8 session time zone and truncated to the minute; the raw conversion gives 13:30:30).
df_explode = df_explode.withColumn("time", from_unixtime(df_explode["time"].cast('bigint') / 1000))
#对分钟级取整
df_explode = df_explode.withColumn("time",
date_trunc("minute", from_unixtime(df_explode["time"].cast('bigint') / 1000)))