咱们就不说废话了,直接上完整的源码
def startGetData(self):
    """Entry point of the crawl: request the ad feed and hand it to the parser.

    Stores the endpoint on ``self.url``, builds the request body with
    ``getBody`` and forwards both to ``parse_url``.
    """
    endpoint = "https://nex.163.com/q"
    self.url = endpoint
    payload = self.getBody()
    self.parse_url(self.url, payload)
这个是启动函数
def getBody(self):
    """Return the JSON request body sent to the ad endpoint.

    The payload is a fixed JSON document (ad unit, device description, app
    version and account token) returned as a string, ready to be used as the
    POST data.

    Returns:
        str: the JSON text, unchanged from the captured request.
    """
    # These are the parameters the API request has to carry.
    # Adjacent literals are concatenated by the parser, so the resulting
    # string is identical to the original single-line payload.
    body = (
        '{ "adunit": { "category": "FOCUS2", "app_version": "34.0", "city": "", '
        '"app": "7A16FBB6", '
        '"location": "1,2,20,21,22,23,24,25,26,27,28,29,30,31,10", '
        '"blacklist": "", "province": "" }, "ext_param": {}, '
        '"device": { "mac": "", "dt": "iPhone 6", '
        '"idfa": "18454932-A441-4720-8973-776284A58B7F", "mcc": "", '
        '"longitude": "113.3360799435621", "isp": "cm", '
        '"latitude": "23.12629215782341", "dq": "750x1334", "os": "ios", '
        '"imei": "", "galaxy_tag": "C622E64A-3478-40E6-99EC-32AB80BE4D4B", '
        '"city_code": "440106", "network_status": "wifi", '
        '"location_type": 1, "udid": "" }, "version": "9.1.3", '
        '"urs": "2qq2Z9GdG9+N5ruzJyW1p8aKM2B+F3Tmv+EBsxR5PT4=", '
        '"is_test": false }'
    )
    return body
def parse_url(self, url, body):
    """Perform the network request and pass the returned JSON string on.

    POSTs ``body`` to ``url``, decodes the response bytes as UTF-8 and hands
    the resulting text to ``parse_json``.

    Args:
        url: endpoint to POST to.
        body: request payload, sent as the raw POST data.

    Raises:
        requests.exceptions.RequestException: network failures (including the
            timeout) are not caught here and propagate to the caller.
    """
    # NOTE(review): verify=False turns off TLS certificate verification. It is
    # kept because the original request relied on it; confirm it is still needed.
    # The timeout stops a stalled connection from blocking the crawl forever.
    response = requests.post(url, data=body, verify=False, timeout=10)
    self.parse_json(response.content.decode("utf-8"))
def parse_json(self, jsonStr):
    """Parse the ad feed and store every ad whose id starts with ``"yx"``.

    Stamps ``DataInfo.time`` with ``Util().getCurrTime()``, then walks the
    ``"ads"`` list of the decoded document and forwards each matching entry to
    ``savaDataInfo``. The accompanying article describes this filter as
    selecting the game advertisements.

    Args:
        jsonStr: JSON text of the response.

    A ``KeyError`` raised anywhere during the walk is printed and ends the
    processing of the remaining entries.
    """
    DataInfo.time = Util().getCurrTime()
    try:
        document = json.loads(jsonStr)
        for ad in document["ads"]:
            ad_id = ad["adid"]
            if not ad_id.startswith("yx"):
                continue
            self.savaDataInfo(ad)
    except KeyError as x:
        print(x)
解析json数据并筛选游戏广告
def savaDataInfo(self, json_str):
    """Copy one ad entry into ``DataInfo`` and persist it to MySQL.

    Args:
        json_str: a single decoded ad entry (a dict, despite the name). It is
            read for ``"title"``, ``"relatedActionLinks"[0]["url"]`` and, via
            ``saveBitmapOrPath``, its first image URL.

    Raises:
        KeyError, IndexError: when the entry lacks one of the fields above.
    """
    DataInfo.title = json_str["title"]
    DataInfo.type = 1
    DataInfo.channel = "wangyixinwen"
    download_url = json_str["relatedActionLinks"][0]["url"]
    # insert_inspection_list reads DataInfo.app_download; the original only set
    # the misspelled ``appdownload``, so the link was never stored.
    DataInfo.app_download = download_url
    # Still set in case other code reads the old attribute name.
    DataInfo.appdownload = download_url
    self.saveBitmapOrPath(json_str)
    DataInfo.device_type = "ios"
    DataInfo.source_type = 1
    # A new manager (and connection) is created per ad, so release it after
    # the insert instead of leaking one connection per record.
    manager = MySqlManager()
    try:
        manager.insert_inspection_list(4)
    finally:
        manager.close()
将广告数据保存到mysql数据库
def saveBitmapOrPath(self, json_str):
    """Download the ad's first image and record its URL and saved path.

    Sets ``DataInfo.pic_path`` to ``{"pic_path1": <self.path + filename>}``,
    downloads ``json_str["resources"][0]["urls"][0]`` through
    ``Util().save_img`` and finally sets ``DataInfo.pic_list`` to
    ``{"pic1": <image url>}``.

    Args:
        json_str: a single decoded ad entry (a dict).
    """
    # Microsecond timestamp keeps file names distinct between downloads.
    filename = "pic1_%d.jpg" % int(time.time() * 1000000)
    DataInfo.pic_path = {"pic_path1": self.path + filename}
    image_url = json_str["resources"][0]["urls"][0]
    Util().save_img(image_url, filename, self.path)
    DataInfo.pic_list = {"pic1": image_url}
将游戏广告图片下载到服务器
def getFilePath(self, plat):
    """Return the month-bucketed upload directory for ``plat``.

    Args:
        plat: platform name used as the sub-directory.

    Returns:
        str: ``/upload/<plat>/<YYYY-MM>/`` for the current local month.
    """
    return "/upload/" + plat + "/" + str(time.strftime("%Y-%m") + "/")


def save_img(self, limg_url, filename, path):
    """Download an image and write it below the material script directory.

    The target directory is ``/home/tianchao/python/materialscript`` + ``path``
    and is created when missing. (The original comment named a ``book\\img``
    folder under the script directory as the default; the code uses the fixed
    root above.)

    Args:
        limg_url: URL of the image to download.
        filename: file name appended to the target directory.
        path: relative directory, expected to end with a separator because the
            file name is appended by plain concatenation.

    Failures are printed instead of raised, so the caller gets no signal when
    the download or the write did not succeed.
    """
    try:
        file_path = "/home/tianchao/python/materialscript" + path
        if not os.path.exists(file_path):
            print('文件夹', file_path, '不存在,重新建立')
            os.makedirs(file_path)
        pic = requests.get(limg_url, timeout=5)  # give up after 5 seconds
        file_name = file_path + filename
        print(file_name)
        # ``with`` closes the handle even when the write fails part-way.
        with open(file_name, 'wb') as fp:
            fp.write(pic.content)
    except IOError as e:
        print('文件操作失败:', e)
    except Exception as e:
        print('错误 :', e)
以上是保存图片的代码
def __init__(self):
    """Initialise instance state and open the MySQL connection.

    ``self.path`` is the directory returned by ``Util().getFilePath("dataeye")``
    and ``pic1_path``..``pic3_path`` hold the paths of the pictures saved by the
    latest ``savePic`` call. A failed connection is printed rather than raised,
    in which case ``self.conn`` stays ``None``.
    """
    self.product_detail = "product_detail"
    self.path = Util().getFilePath("dataeye")
    self.pic1_path = ""
    self.pic2_path = ""
    self.pic3_path = ""
    # Defined up front so the attribute exists even when connecting fails.
    self.conn = None
    # NOTE(review): host, user and password are hard-coded (and published with
    # this file). Kept so existing deployments keep working; rotate the
    # password and load these values from configuration instead.
    try:
        self.conn = pymysql.connect(host='123.207.58.110',
                                    port=3306,
                                    user='root',
                                    password='iwkSlb0yt=pu',
                                    db="material_database",
                                    charset='utf8',
                                    cursorclass=pymysql.cursors.DictCursor
                                    )
    except OperationalError as e:
        print(e)


def insert_product_detail(self, product_id, json_obj):
    """Insert the product details, or update them when the product exists.

    Args:
        product_id: id looked up in (and used to update) ``product_detail``.
        json_obj: decoded product record providing ``companyNum``, ``days``,
            ``firstSeen``, ``labels``, ``lastSeen``, ``logoURL``, ``mediaList``,
            ``mediaNum``, ``productId`` and ``productName``.
    """
    table_name = "product_detail"
    exists = self.isProductIdExits(table_name, product_id) == 1
    # Text columns go through str() to keep the rendering of the former '%s'
    # formatting; the driver now does the quoting and escaping.
    values = (
        json_obj["companyNum"],
        json_obj["days"],
        str(json_obj["firstSeen"]),
        str(json_obj["labels"]),
        str(json_obj["lastSeen"]),
        str(json_obj["logoURL"]),
        json.dumps(json_obj["mediaList"]),
        json_obj["mediaNum"],
        json_obj["productId"],
        str(json_obj["productName"]),
        self.getCurrentTime(),
    )
    if exists:
        update_sql = (
            "UPDATE " + table_name +
            " SET company_num=%s,days=%s,first_seen=%s,labels=%s,last_seen=%s,"
            "logo_url=%s,media_list=%s,media_num=%s,product_id=%s,product_name=%s,"
            "updated_at=%s WHERE product_id=%s"
        )
        self.execute(update_sql, values + (product_id,))
    else:
        insert_sql = (
            "INSERT INTO " + table_name +
            "(company_num,days,first_seen,labels,last_seen,logo_url,media_list,"
            "media_num,product_id,product_name,created_at)"
            " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        self.execute(insert_sql, values)


def insert_product_detail_table(self, product_id, json_obj):
    """Insert or update the chart data shown on the product detail page.

    Args:
        product_id: id of the product the chart data belongs to.
        json_obj: decoded record providing ``adCreativeList``,
            ``adMaterialList``, ``xlabel`` and ``adCountLastYear``.
    """
    table_name = "product_detail_table"
    exists = self.isProductIdExits(table_name, product_id) == 1
    values = (
        json.dumps(json_obj["adCreativeList"]),
        json.dumps(json_obj["adMaterialList"]),
        json.dumps(json_obj["xlabel"]),
        json_obj["adCountLastYear"],
        product_id,
        self.getCurrentTime(),
    )
    if exists:
        # The original assigned ad_creative_list twice, so the material list
        # overwrote the creative list and ad_material_list was never updated.
        update_sql = (
            "UPDATE " + table_name +
            " SET ad_creative_list=%s,ad_material_list=%s,xlabel=%s,"
            "ad_count_last_year=%s,product_id=%s,updated_at=%s WHERE product_id=%s"
        )
        self.execute(update_sql, values + (product_id,))
    else:
        insert_sql = (
            "INSERT INTO " + table_name +
            "(ad_creative_list,ad_material_list,xlabel,ad_count_last_year,product_id,created_at)"
            " VALUES (%s,%s,%s,%s,%s,%s)"
        )
        self.execute(insert_sql, values)


def insert_product_detail_pic(self, product_id, json_obj):
    """Download a picture material and insert or update its database row.

    Args:
        product_id: id of the product the material belongs to.
        json_obj: decoded material record (``materialId``, ``pic1``..``pic3``,
            ``video``, size and statistics fields).
    """
    self.savePic(json_obj)
    table_name = "product_pic_material_list"
    material_id = json_obj["materialId"]
    exists = self.isMaterialIdExits(table_name, material_id) == 1
    values = (
        json_obj["companyNum"],
        json_obj["creativeNum"],
        str(json_obj["firstSeen"]),
        json_obj["h"],
        json_obj["lastDays"],
        str(json_obj["lastSeen"]),
        material_id,
        json_obj["materialType"],
        json.dumps(json_obj["mediaList"]),
        str(json_obj["new"]),
        str(json_obj["pic1"]),
        str(json_obj["pic2"]),
        str(json_obj["pic3"]),
        json_obj["productNum"],
        str(json_obj["video"]),
        json_obj["w"],
        product_id,
        self.getCurrentTime(),
    )
    if exists:
        # The original ended with a second ``video=`` that received the
        # timestamp and clobbered the video URL. The timestamp presumably
        # belongs in ``updated_at`` like in the other tables.
        # NOTE(review): confirm this table has an updated_at column.
        update_sql = (
            "UPDATE " + table_name +
            " SET company_num=%s,creative_num=%s,first_seen=%s,h=%s,last_days=%s,"
            "last_seen=%s,material_id=%s,material_type=%s,media_list=%s,new=%s,"
            "pic1=%s,pic2=%s,pic3=%s,product_num=%s,video=%s,w=%s,product_id=%s,"
            "updated_at=%s WHERE material_id=%s"
        )
        self.execute(update_sql, values + (material_id,))
    else:
        insert_sql = (
            "INSERT INTO " + table_name +
            "(company_num,creative_num,first_seen,h,last_days,last_seen,material_id,"
            "material_type,media_list,new,pic1,pic2,pic3,product_num,video,w,"
            "product_id,created_at,pic1_path,pic2_path,pic3_path)"
            " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )
        paths = (self.pic1_path, self.pic2_path, self.pic3_path)
        self.execute(insert_sql, values + paths)


def savePic(self, json_obj):
    """Download ``pic1``..``pic3`` of a material and remember where they went.

    Each non-blank URL is saved through ``Util().save_img`` and its target path
    stored in the matching ``self.picN_path``; blank URLs leave that path empty.
    """
    urls = (json_obj["pic1"], json_obj["pic2"], json_obj["pic3"])
    # Reset first: otherwise a blank picture would inherit the path saved for
    # the previous material handled by this instance.
    self.pic1_path = ""
    self.pic2_path = ""
    self.pic3_path = ""
    for index, url in enumerate(urls, start=1):
        if url.strip() == '':
            continue
        filename = "pic%d_" % index + str(int(time.time() * 1000000)) + ".jpg"
        setattr(self, "pic%d_path" % index, self.path + filename)
        Util().save_img(url, filename, self.path)


def isProductIdExits(self, table_name, product_id):
    """Return the number of rows in ``table_name`` with this ``product_id``."""
    query_sql = "select *from " + table_name + " where product_id = %s"
    with self.conn.cursor() as cursor:
        result = cursor.execute(query_sql, (product_id,))
    print(result)
    self.conn.commit()
    return result


def isMaterialIdExits(self, table_name, material_id):
    """Return the number of rows in ``table_name`` with this ``material_id``."""
    query_sql = "select *from " + table_name + " where material_id = %s"
    with self.conn.cursor() as cursor:
        result = cursor.execute(query_sql, (material_id,))
    print(result)
    self.conn.commit()
    return result


def insert_inspection_list(self, table_id):
    """Insert the ad currently held in ``DataInfo`` into its material table.

    Args:
        table_id: integer selecting the table through ``getTableName``.
    """
    sql = (
        "INSERT INTO " + self.getTableName(table_id) +
        "(title,app_download,time,channel,type,content,gif,video,source_type,"
        "pic_list,pic_path,device_type,material_size,app_name,created_at,updated_at)"
        " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    now = self.getCurrentTime()
    values = (
        str(DataInfo.title),
        str(DataInfo.app_download),
        str(DataInfo.time),
        str(DataInfo.channel),
        DataInfo.type,
        str(DataInfo.content),
        json.dumps(DataInfo.gif),
        json.dumps(DataInfo.video),
        DataInfo.source_type,
        json.dumps(DataInfo.pic_list),
        json.dumps(DataInfo.pic_path),
        str(DataInfo.device_type),
        str(DataInfo.material_size),
        str(DataInfo.app_name),
        now,
        now,
    )
    self.execute(sql, values)


def getCurrentTime(self):
    """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
    return str(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))


def getTableName(self, table_id):
    """Return ``material_<table_id % 10>``, the table used for ``table_id``."""
    return "material_" + str(table_id % 10)


def execute(self, sql, args=None):
    """Run one statement and commit.

    Args:
        sql: statement text; use ``%s`` placeholders for values.
        args: optional sequence of values bound by the driver. Omit it to run
            ``sql`` exactly as given.
    """
    with self.conn.cursor() as cursor:
        cursor.execute(sql, args)
    self.conn.commit()


def close(self):
    """Close the connection if one was opened."""
    if self.conn is not None:
        self.conn.close()
以上是将数据保存到数据库的代码
这里把爬取网易平台新闻和广告的代码贡献出来,希望可以帮到大家