python - 1024 Spider

python3 urlretrieve 


import urllib.request
# Source image URL and the local (Windows) path to save it under.
url = 'http://i3.hunantv.com/p1/20150906/1637244570C.jpg'
file_name = 'F:\\img\\ym.jpg'
...
# Download the file from `url` and save it locally under `file_name`:
urllib.request.urlretrieve(url, file_name)

python md5 加密


import hashlib
import time

# MD5-hash the current Unix timestamp (its str() form) and print the
# 32-character lowercase hex digest.
digest = hashlib.md5(str(time.time()).encode())
md5value = digest.hexdigest()

print(md5value)

python base64 加密,encodeURIComponent


import time
import base64
from urllib import parse

# Current timestamp, printed in full first.
tm = time.time()
print(str(tm))

# Keep only the integer-second part (first 10 characters).
tm = str(tm)[0:10]
print(tm)

# Base64-encode the truncated timestamp...
print(base64.b64encode(tm.encode()).decode())
b64 = base64.b64encode(tm.encode()).decode()

# ...then percent-escape it, i.e. JavaScript's encodeURIComponent.
print(parse.quote(b64))

'''
MTUwMjQyODkwMw==
MTUwMjQyNTE1Mw==
'''


草榴 Spider


config


# HTTP/HTTPS proxy used by every requests call (placeholder values).
PROXIES = { "http": "http://xxx:xxxx", "https": "http://xxx:xxxx", }

# MongoDB host and database name.
MONGO_URL = '127.0.0.1'
MONGO_DB = 'caoliu'

CLSQ = 'http://xxx.com/' # site base URL (real address intentionally omitted)
CLSQ_DOWNLOAD = 'http://www.xxx.com/download.php?'

# Local directory template for saved .torrent files; {} is the date string.
BT_PATH = 'F:\\1024bt\\{}\\'


viidii


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from pyquery import PyQuery as pq

def get_b64(art_hash):
    """Open the viidii redirect page for *art_hash* and return the base64
    token stored in the page's hidden <input> element.

    :param art_hash: rmdown BT hash of one article
    :return: the base64 token string (or None if the attribute is absent)

    NOTE(review): PhantomJS support was removed from newer Selenium
    releases — confirm the pinned selenium version before upgrading.
    """
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=true'])
    try:
        browser.set_window_size(1400, 900)
        wait = WebDriverWait(browser, 10)

        browser.get('http://www.viidii.info/?http://www______rmdown______com/link______php?hash={}'.format(art_hash))

        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type=hidden]'))
        )

        doc = pq(browser.page_source)
        return doc.find('input[type=hidden]').attr('value')
    finally:
        # Fix: the original never closed the browser, leaking one PhantomJS
        # process per call.
        browser.quit()


spider


import requests
import pymongo
import os
import re
# import base64
import time
import viidii

from pyquery import PyQuery as pq
from Config import *

from urllib import parse


# --- Database -------------------------------------------------------------
# Connect to MongoDB; ``table`` is (re)bound to a per-category collection
# inside ``__main__`` and accessed by the functions via ``global table``.
# (The original used bare ``global`` statements here, which are no-ops at
# module scope — replaced with real initializers.)
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
table = None  # assigned per category in __main__

# --- Dates ----------------------------------------------------------------
import datetime
today = datetime.date.today()
one_day = datetime.timedelta(1)
yesterday = today - one_day
before_day = yesterday - one_day

yesterday_str = yesterday.strftime('%Y-%m-%d')
before_day_str = before_day.strftime('%Y-%m-%d')

# --- Counter --------------------------------------------------------------
count = 0  # documents inserted for the current category

def insert_to_mongo(info):
    """Insert *info* into the current collection unless a document with the
    same ``art_name`` already exists.

    :param info: article dict (art_name / art_url / art_hash / ...)

    Note: ``insert_one`` mutates *info* by adding an ``_id`` field (dicts
    are passed by reference), which is why ``art_item`` must build a fresh
    dict for every article — reusing one would make Mongo reject the
    duplicate ``_id``.
    """
    global count
    global table
    query_info = {'art_name': info['art_name']}
    # count_documents / insert_one replace the cursor.count() and
    # Collection.insert() APIs that were removed in PyMongo 4.
    if table.count_documents(query_info, limit=1) == 0:
        table.insert_one(info)
        count += 1
        print('成功插入第 ', count, ' 数据:', info)
    else:
        print('重复:', info)

def art_bt_hash(url):
    """Fetch an article page and extract its rmdown BT hash.

    :param url: full article URL
    :return: the hash string
    :raises AttributeError: when the page contains no hash link
        (``re.search`` returns None) — deliberately propagated so that
        ``art_item`` can skip the article.
    :raises UnicodeDecodeError: when the page is not valid GBK.
    """
    page = requests.get(url=url, proxies=PROXIES).content.decode('gbk')
    # Direct regex match; renamed local (was ``hash``, shadowing the builtin).
    bt_hash = re.search(r'rmdown\.com/link\.php\?hash=(.*?)<', page, re.S).group(1)
    return bt_hash

def art_item(tr):
    """Build the record for one article row and store it in Mongo.

    Returns early (skipping the row) when the article page yields no hash
    or cannot be decoded.

    :param tr: pyquery <tr> element of one listing row
    """
    hash_url = tr.find('.tal > h3 > a').attr('href')
    art_url = '{}{}'.format(CLSQ, hash_url)
    info = {
        'art_name': tr.find('h3').text(),
        'art_url': art_url,
    }
    try:
        info['art_hash'] = art_bt_hash(art_url)
    except (AttributeError, UnicodeDecodeError) as e:
        print(e.args)
        return
    info['art_time'] = yesterday_str
    info['art_flag'] = '0'
    insert_to_mongo(info=info)

def next_tags(**kwargs):
    """Walk the listing pages starting at ``page_num``, collecting rows
    dated yesterday; stop when a row dated *before_day_str* appears.

    Implemented as a loop — the original recursed once per page and once
    per retry, which can exhaust the interpreter recursion limit on long
    threads.

    :param kwargs: ``base`` — listing URL without the page parameter;
                   ``page_num`` — first page number to fetch.
    """
    base = kwargs['base']
    page_num = kwargs['page_num']
    while True:
        url = '{}&page={}'.format(base, page_num)
        print(url)
        try:
            doc = requests.get(url=url, proxies=PROXIES).content
        except requests.exceptions.ContentDecodingError as e:
            print(e.args)
            time.sleep(2)
            continue  # retry the same page after a short pause
        html = pq(doc)
        trs = html.find('#ajaxtable > tbody:nth-child(2) > tr').items()

        for tr in trs:
            art_time = tr.find('div[class=f10]').text()
            if art_time == before_day_str:
                return  # reached the day before yesterday — done
            if art_time == yesterday_str:
                art_item(tr)
        page_num += 1

# 得到 hash 码,然后放入mongodb
# Scan page 1 of a category, collect hashes into MongoDB, then paginate.
def art_tags(**kwargs):
    """Process the first listing page of one category, then hand the
    remaining pages to ``next_tags``.

    Rows are ignored until the '普通主題' (ordinary topics) separator row
    has been seen — only rows after that marker are real articles.

    :param kwargs: ``url`` — category listing URL.
    """
    url = kwargs['url']
    print(url)
    page = requests.get(url=url, proxies=PROXIES).content
    rows = pq(page).find('#ajaxtable > tbody:nth-child(2) > tr').items()

    in_articles = False
    for row in rows:
        if in_articles:
            stamp = row.find('div[class=f10]').text()
            if stamp == before_day_str:
                return
            if stamp == yesterday_str:
                art_item(row)
        # Marker check comes last so the marker row itself is never treated
        # as an article (same ordering as the original).
        if row.text() == '普通主題':
            in_articles = True
    next_tags(base=url, page_num=2)

def downloader(**kwargs):
    """Download one .torrent file into the per-day BT_PATH directory.

    :param kwargs: ``url`` — download URL;
                   ``hash`` — BT hash, used as the local file name.
    :return: True on success, False when the target directory does not
             exist (FileNotFoundError is reported, not raised).
    """
    url = kwargs['url']
    bt_hash = kwargs['hash']  # renamed local: don't shadow builtin hash()
    data = requests.get(url).content
    print(url, data)
    path = '{}{}.torrent'.format(BT_PATH.format(yesterday_str), bt_hash)
    try:
        with open(file=path, mode='wb') as f:
            f.write(data)
    except FileNotFoundError as e:
        print(e.args)
        return False
    print('bt -> ', path)
    return True
# bt 下载器,从mongodb 中得到hash,下载bt
# BT downloader: read hashes from MongoDB and fetch the .torrent for each.
def art_bt_download(**kwargs):
    """Download the .torrent for every document matching ``query_info``.

    Successful downloads are flagged ``art_flag='1'``; failed ones are
    deleted so a later run can re-scrape them.

    :param kwargs: ``query_info`` — MongoDB filter dict.
    """
    global table
    query_info = kwargs['query_info']
    for item in table.find(query_info):
        art_hash = item['art_hash']
        # Timestamp token from viidii, base64-encoded then URL-escaped
        # (encodeURIComponent equivalent).
        # stamp_base64 = parse.quote(base64.b64encode(str(time.time())[0:10].encode()).decode())
        stamp_base64 = parse.quote(viidii.get_b64(art_hash=art_hash))
        url = '{}ref={}&reff={}&submit=download'.format(CLSQ_DOWNLOAD, art_hash, stamp_base64)
        if downloader(url=url, hash=art_hash):
            # update_one / delete_one replace the Collection.update(spec=...)
            # and remove() APIs that were removed in PyMongo 4.
            table.update_one({'_id': item['_id']}, {'$set': {'art_flag': '1'}})
        else:
            table.delete_one({'_id': item['_id']})
            print('删除一条数据...')

if __name__ == '__main__':
    # (Bare ``global`` statements are no-ops at module scope — removed.)

    # Create the per-day download folder.
    if not os.path.exists(BT_PATH.format(yesterday_str)):
        os.makedirs(BT_PATH.format(yesterday_str))
    # Category id -> collection (board) name.
    url_dict = {'2' : '亞洲無碼原創區', '15' : '亞洲有碼原創區', '5' : '動漫原創區'}
    # Iterate every category; 'fid' avoids shadowing the builtin type().
    for fid, name in url_dict.items():
        count = 0
        table = db[name]
        # art_tags(url='{}thread0806.php?fid={}'.format(CLSQ, fid))
        print('启动下载器...')
        # art_bt_download(query_info={'art_flag': '0'})
        art_bt_download(query_info={'art_time' : '2017-08-10'})







你可能感兴趣的:(Python,数据采集)