Python crawler script: scraping a dynamic web page (source code)

The page loads its data over AMF (Flash remoting), so instead of parsing HTML the script uses pyamf to post RemotingMessage requests directly to the backend endpoint and writes the returned farm-produce price records into an Oracle table.

'''
Created on 2019-12-23

@author: Zhangzhiwei
'''
import datetime
import threading
import cx_Oracle
import urllib.request
import uuid
import pyamf
from pyamf import remoting
from pyamf.flex import messaging

import os
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'  # so cx_Oracle handles the Chinese text as UTF-8
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

class HqPara:

    def __init__(self, breedInfo, provice, breedInfoDl, marketInfo):
        self.marketInfo = marketInfo
        self.breedInfoDl = breedInfoDl
        self.breedInfo = breedInfo
        self.provice = provice


class BreedInfoPo():

    def __init__(self, children0, item_code, item_name, children1):
        self.parentcode = None
        self._children = children0
        self.itemcode = item_code  # String, e.g. '000000'
        self.itemname = item_name  # String, e.g. '全部' (all)
        self.children = children1


class ProvincePo():

    def __init__(self, item_code, item_name):
        self._children = None
        self.itemcode = item_code  # String, e.g. '000000'
        self.itemname = item_name  # String, e.g. '全国' (nationwide)
        self.children = None


class BreedInfo():

    def __init__(self, item_code, item_name):
        self.parentcode = None
        self._children = None
        self.itemcode = item_code  # String, e.g. 'AM'
        self.itemname = item_name  # String, e.g. '水产品' (aquatic products)
        self.children = None


class PMarketInfo():

    def __init__(self, market_code, market_name):
        self.marketCode = market_code
        self.marketName = market_name  # String 000000/AM


# registerClassAlias("personTypeAlias", Person);
# Register the custom body parameter classes so that com.itown.kas.pfsc.report.po.HqPara (and the
# related types) are serialized with the request; otherwise the server may reject the call with
# Client.Message.Deserialize.InvalidType.
pyamf.register_class(HqPara, alias='com.itown.kas.pfsc.report.po.HqPara')
pyamf.register_class(BreedInfoPo, alias='com.itown.kas.pfsc.report.po.BreedInfoPo')
pyamf.register_class(ProvincePo, alias='com.itown.kas.pfsc.report.po.ProvincePo')
pyamf.register_class(BreedInfo, alias='com.itown.kas.pfsc.report.po.BreedInfo')  # assumed alias; each class needs its own, pyamf rejects duplicates
pyamf.register_class(PMarketInfo, alias='com.itown.kas.pfsc.report.po.PMarketInfo')


def construct_request(product_code, product_name, provice_code, provice_name, breed_code, breed_name, market_code, market_name, page_num, total_num):
    breedInfo = BreedInfoPo(None, product_code, product_name, None)
    provice = ProvincePo(provice_code, provice_name)
    breedInfoDl = BreedInfo(breed_code, breed_name)
    marketInfo = PMarketInfo(market_code, market_name)

    # Build the flex.messaging.messages.RemotingMessage
    msg = messaging.RemotingMessage(messageId=str(uuid.uuid1()).upper(),
                                    clientId=str(uuid.uuid1()).upper(),
                                    operation='getHqSearchData',
                                    destination='reportStatService',
                                    timeToLive=0,
                                    timestamp=0)
    # msg.body: the first element is the query parameters, the second is the page number, and the
    # third is the per-page record count (the site defaults to 15 rows per page, but for crawling
    # it can be set to the full record count in one go).
    msg.body = [HqPara(breedInfo, provice, breedInfoDl, marketInfo), str(page_num), str(total_num)]
    print(str(page_num))
    print(str(total_num))
    msg.headers['DSEndpoint'] = None
    msg.headers['DSId'] = str(uuid.uuid1()).upper()
    # Encode the message as an AMF3 envelope
    req = remoting.Request('null', body=(msg,))
    env = remoting.Envelope(amfVersion=pyamf.AMF3)
    env.bodies = [('/1', req)]
    data = bytes(remoting.encode(env).read())
    return data


# POST the AMF payload to the endpoint and return the raw response bytes
def getResponse(data):
    http_handler = urllib.request.HTTPHandler()
    url = 'http://jgsb.agri.cn/messagebroker/amf'
    req = urllib.request.Request(url, data, headers={**headers, 'Content-Type': 'application/x-amf'})
    # send the request and read the response body
    opener = urllib.request.build_opener(http_handler)
    return opener.open(req).read()


def getContent(response):
    amf_parse_info = remoting.decode(response)
    # total number of records available
    total_num = amf_parse_info.bodies[0][1].body.body[3]
    # list of price records for the requested page
    info = amf_parse_info.bodies[0][1].body.body[0]
    print(info)
    return total_num, info
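# Each element of the returned `info` list is one price record; the fields consumed
# in func() below are farmProduceName, averagePrice, marketName and reportDate.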

def func():
    conn = cx_Oracle.connect('username/[email protected]:8082/ORCL')
    print("Database connected")
    cursor_oracle = conn.cursor()
    num = 0

    b = '元/公斤'  # price unit (yuan/kg) written into every row
    # First request: ask for only 2 rows just to learn the total record count
    # (parameters: all products, Shandong province, all breeds, Shouguang logistics park market)
    reqData = construct_request('000000', '全部', '370000', '山东省', '000000', '全部', '3707056', '寿光物流园', 1, 2)
    rep = getResponse(reqData)
    total_num, info = getContent(rep)
    # Second request: fetch everything in a single page of total_num rows
    reqData = construct_request('000000', '全部', '370000', '山东省', '000000', '全部', '3707056', '寿光物流园', 1, total_num)
    rep = getResponse(reqData)
    total_num, info = getContent(rep)
    for record in info:
        print(record)
        sql = "insert into PRODUCTPRICE(ID,NAME,PRICE,UNIT,AREA,INDATE) values ('" + str(uuid.uuid1()) + "','" + record["farmProduceName"] + "','" + str(record["averagePrice"]) + "','" + b + "','" + record["marketName"] + "','" + str(record["reportDate"]) + "')"
        print(sql)
        cursor_oracle.execute(sql)
        num = num + 1
        # commit in batches of roughly 100 rows
        if num > 100:
            conn.commit()
            num = 0
    conn.commit()
    cursor_oracle.close()
    conn.close()

    # schedule the next full run 24 hours (86400 seconds) from now
    timer = threading.Timer(86400, func)
    timer.start()
    
# current time
now_time = datetime.datetime.now()
# same time tomorrow
next_time = now_time + datetime.timedelta(days=+1)
next_year = next_time.date().year
next_month = next_time.date().month
next_day = next_time.date().day
# tomorrow at 03:00
next_time = datetime.datetime.strptime(str(next_year)+"-"+str(next_month)+"-"+str(next_day)+" 03:00:00", "%Y-%m-%d %H:%M:%S")
# # yesterday's time
# last_time = now_time + datetime.timedelta(days=-1)

# seconds from now until 03:00 tomorrow
timer_start_time = (next_time - now_time).total_seconds()
print(timer_start_time)
# Timer(delay in seconds, function to run): start the first run at 03:00 tomorrow
timer = threading.Timer(timer_start_time, func)
timer.start()
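
Note: the INSERT in func() builds the SQL string by concatenation, which breaks on any value containing a single quote and is open to SQL injection. A safer variant of the same statement (a sketch using cx_Oracle bind variables; it assumes the PRODUCTPRICE table with the columns above already exists) would be:

        sql = ("insert into PRODUCTPRICE (ID, NAME, PRICE, UNIT, AREA, INDATE) "
               "values (:1, :2, :3, :4, :5, :6)")
        # values are passed separately, so quoting and escaping are handled by the driver
        cursor_oracle.execute(sql, (str(uuid.uuid1()),
                                    record["farmProduceName"],
                                    str(record["averagePrice"]),
                                    b,
                                    record["marketName"],
                                    str(record["reportDate"])))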
