1.首先还是那句话,如有雷同,当我抄你的。
2.疫情影响,想出去玩也不行,有点抑郁。
3.19年年底,换工作了,大的硬核技术上还没啥进阶的,还是工程落地解决实际问题为主了,多点经验罢了,这些事儿多少有丢丢boring。
4.主要就是,利用机器学习思路来解决产品中实际的问题,比如预测流失、活跃、投诉等等,无非就是拆解问题、找特征、构建样本、调模型。
5.啃老本,也有点难受,毕竟,乐趣来自于进步,所以,还是得找点乐子
一个是抓取的时候加入代理,一个是利用chromedriver来应对滚动刷新的情况
关键词:代理,chromedriver
import urllib.request as req
import urllib
# Example: fetch a page through an authenticated proxy.
# urllib picks the proxy by the scheme of the requested URL, so the proxy is
# registered for both 'http' and 'https'; with an 'http'-only mapping the
# https:// request below would bypass the proxy entirely.
proxy_url = r'http://username:password@host:port'
proxy = req.ProxyHandler({'http': proxy_url, 'https': proxy_url})
auth = req.HTTPBasicAuthHandler()
opener = req.build_opener(proxy, auth, req.HTTPHandler)
# Install globally so that every later req.urlopen() call goes through this opener.
req.install_opener(opener)
# The context manager closes the response once the body has been read.
with req.urlopen('https://xxx') as conn:
    return_str = conn.read()
#利用chromedriver来爬,模拟浏览器行为,可以滚动刷新,可以看到页面情况
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import random
# Create the Chrome WebDriver session used by the crawler below; the argument
# is a placeholder path (presumably to the local chromedriver executable).
driver=webdriver.Chrome("/xxx/xxx/chromedriver")
def getPhoneUrl(driver_use, cls_page_url):
    """Collect the link targets of all ``<h4 class="title">`` entries on a page.

    The page is opened in the given WebDriver and scrolled to the bottom
    eleven times (one second apart) so that content loaded on scroll has a
    chance to appear before the HTML is parsed.

    Args:
        driver_use: WebDriver-like object providing ``get``,
            ``execute_script`` and ``page_source``.
        cls_page_url: URL of the listing page to open.

    Returns:
        List of ``href`` values, in document order. Title entries without a
        usable ``<a href=...>`` are printed and skipped.
    """
    driver_use.get(cls_page_url)
    for _ in range(10 + 1):
        driver_use.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

    soup = BeautifulSoup(driver_use.page_source, 'lxml')
    urls = []
    for app in soup.find_all('h4', attrs={'class': 'title'}):
        try:
            phone_a = app.find('a')
            print(phone_a['href'])
            urls += [phone_a['href']]
        except (TypeError, KeyError):
            # TypeError: no <a> inside the title (find returned None).
            # KeyError: the <a> has no href attribute.
            # Anything else (including Ctrl-C) is allowed to propagate.
            print(app)
    return urls
ps:face_recognition安装google搜一下就好,有点小坑
关键词:imap_unordered
import cv2
import numpy as np
def rotate_bound(image, angle):
    """Rotate ``image`` by ``angle`` degrees without cropping the corners.

    The rotation matrix is built for ``-angle`` around the image centre, the
    output canvas is enlarged to the bounding box of the rotated image, and
    the matrix translation is shifted so the result is centred on that canvas.

    Args:
        image: Array whose first two dimensions are (height, width).
        angle: Rotation angle in degrees.

    Returns:
        The rotated image as produced by ``cv2.warpAffine``.
    """
    height, width = image.shape[:2]
    center_x, center_y = width // 2, height // 2

    # Negated angle: the function's angle convention is the opposite of OpenCV's.
    rotation = cv2.getRotationMatrix2D((center_x, center_y), -angle, 1.0)
    abs_cos = np.abs(rotation[0, 0])
    abs_sin = np.abs(rotation[0, 1])

    # Size of the axis-aligned box that contains the whole rotated image.
    new_width = int((height * abs_sin) + (width * abs_cos))
    new_height = int((height * abs_cos) + (width * abs_sin))

    # Move the rotation centre to the centre of the enlarged canvas.
    rotation[0, 2] += (new_width / 2) - center_x
    rotation[1, 2] += (new_height / 2) - center_y

    return cv2.warpAffine(image, rotation, (new_width, new_height))
def get_encode(image):
    """Return face encodings for ``image``, retrying on rotated copies.

    Faces in the photo may be tilted, in which case face_recognition can fail
    to find them. The image is therefore tried at 0, +30, -30, +60, -60, ...
    up to 180 degrees, stopping at the first rotation that yields at least
    one encoding.

    Args:
        image: Image array accepted by ``rotate_bound`` and
            ``face_recognition.face_encodings``.

    Returns:
        The encodings found at the first successful rotation, or the (empty)
        result of the last attempt when no rotation produced a face.
    """
    face_encoding = []
    for step in range(0, 7):
        angle = 30 * step
        # 0 and 180 degrees coincide with their negatives, so trying both
        # signs would only repeat an expensive encoding pass.
        candidates = (angle,) if angle in (0, 180) else (angle, -angle)
        for candidate in candidates:
            face_encoding = face_recognition.face_encodings(rotate_bound(image, candidate))
            if len(face_encoding) > 0:
                return face_encoding
    return face_encoding
import face_recognition
def face_distance(face_encodings, face_to_compare):
    """Return the norm of the difference between two encodings.

    Args:
        face_encodings: Array-like supporting ``len`` and subtraction with
            ``face_to_compare`` (e.g. a numpy array).
        face_to_compare: Encoding to subtract.

    Returns:
        An empty numpy array when ``face_encodings`` is empty; otherwise a
        single value, ``np.linalg.norm`` taken over the whole difference
        (no axis is given, so a 2-D input is reduced to one number).
    """
    if len(face_encodings) != 0:
        difference = face_encodings - face_to_compare
        return np.linalg.norm(difference)
    return np.empty((0))
def path_to_encode(path_i_list, path_samp_pic="../xxfile"):
    """Encode the first usable photo out of several photos of one person.

    A person may have several photos; they are tried in order and the first
    one that yields an encoding wins.

    Args:
        path_i_list: Iterable of file names relative to ``path_samp_pic``.
        path_samp_pic: Directory that contains the photos.

    Returns:
        ``"<file name>\\t<str(encoding)>"`` for the first photo that could be
        encoded, or ``"-"`` when none could.
    """
    for file_name in path_i_list:
        picture = face_recognition.load_image_file(path_samp_pic + "/" + file_name)
        encodings = get_encode(picture)
        if len(encodings) > 0 and len(encodings[0]) > 1:
            return file_name + "\t" + str(encodings[0])
    return "-"
import multiprocessing
cores = multiprocessing.cpu_count()
# Use half of the cores, but never fewer than one: Pool(processes=0) raises
# ValueError, which int(cores / 2) would hit on a single-core machine.
# The context manager releases the worker processes when the loop is done.
with multiprocessing.Pool(processes=max(1, cores // 2)) as pool:
    # NOTE(review): ff is opened but nothing is written to it in the visible
    # code (results are only printed) -- confirm whether ff.write was intended.
    with open(path_vec_save + name + str(num_1), 'w+', encoding='utf-8') as ff:
        name_all_pic_values = name_all_pic.values()  # the values of the dict
        # imap_unordered yields results as soon as any worker finishes, so the
        # output order does not follow the input order.
        for retval in pool.imap_unordered(path_to_encode, name_all_pic_values, chunksize=40):
            num_1 += 1
            if num_1 % 1000 == 0:
                print('done %d/%d\r' % (num_1, len(name_all_pic_values)))
            print(retval)
print(time.time() - time1)
问题不大,就是特征得处理成对应的向量格式,示例:Vectors.dense(all_feac.map(_.toDouble).toArray)
import org.apache.spark.ml
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
import scala.collection.mutable
// Load the training and test sets. The estimator below reads the "label"
// column and the "feature" vector column from them.
val train_data_004all = spark.read.parquet("/xxx1")
val test_data = spark.read.parquet("/xxx2")

// Stages are collected in a buffer so that further stages can be appended
// before the pipeline is assembled.
val stages = new mutable.ArrayBuffer[PipelineStage]()
// GBTRegressor can be swapped for GBTClassifier for a classification task.
// No impurity is configured: GBT ignores setImpurity, and "entropy"/"gini"
// are classification impurities that do not apply to a regressor anyway.
val dt = new GBTRegressor().
  setLabelCol("label").        // input label column
  setFeaturesCol("feature").   // input feature vector column
  setMaxIter(10).              // maximum number of boosting iterations
  setMaxDepth(3).              // depth of each decision tree
  setStepSize(0.2).            // learning rate, range (0, 1]
  setSeed(1234)
stages += dt
val pipeline = new Pipeline().setStages(stages.toArray)

// Fit the pipeline and report the wall-clock training time.
// NOTE(review): `train_data` is not defined in the visible code (only
// `train_data_004all` is) -- confirm which DataFrame is meant here.
val startTime = System.nanoTime()
val pipelineModel = pipeline.fit(train_data)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")

//val gbtModel = pipelineModel.stages.last.asInstanceOf[GBTClassificationModel]
//val predictions = pipelineModel.transform(train_data)
val df_test_pred = pipelineModel.transform(test_data)

// Inspect the feature importances of the fitted GBT model (stage 0).
val gbtmodel: GBTRegressionModel = pipelineModel.stages(0) match {
  case model: GBTRegressionModel => model
  case other => throw new IllegalStateException(s"Expected a GBTRegressionModel at stage 0 but found ${other.getClass.getName}")
}
val model_imp = gbtmodel.featureImportances.toArray
// Pair every importance with its feature index: (index, importance).
val model_imp_arr = mutable.ArrayBuffer[(Double, Double)]()
model_imp_arr ++= model_imp.zipWithIndex.map { case (importance, idx) => (idx.toDouble, importance) }
// Most important feature first.
val model_imp_arr_order = model_imp_arr.toArray.sortWith((left, right) => left._2 > right._2)
有点颓
荒废了的钢琴课我要重启了
买了个手绘板,我去B站/youtube学画画了
话说有GPT-2做文章摘要的例子,能不能操作一波小说->摘要->匹配影视图片,生成那些自媒体的速读系列啊
说不定啥时候,我真的行动了
end