Python crawler notes

import requests
from bs4 import BeautifulSoup as bs
import datetime
import json
import re
import multiprocessing as mp

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}

Method 1: the response is JSON; parse it directly.

while True:
    try:
        r = requests.get(
            "http://yunhq.sse.com.cn:32041/v1/sh1/list/exchange/equity?select=code%2Cprev_close&order=&begin=0&end=1500",
            headers=headers,
        ).text
        text = json.loads(r)
        cnt = text["list"]      # the quote records live under the "list" key
        break
    except (requests.RequestException, ValueError):
        # Catch network errors and malformed JSON rather than using a bare except.
        continue
The loop retries automatically when a fetch fails once or twice.
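An unconditional `while True`, however, will spin forever if the endpoint stays down. Below is a minimal sketch of a bounded retry; the helper name `fetch_json`, the timeout, the retry count, and the one-second pause are my own choices, not part of the original code:

import time

def fetch_json(url, headers, retries=3):
    # Fetch url and parse the body as JSON, giving up after `retries` attempts.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()   # treat HTTP error codes as failures too
            return resp.json()        # parses the body as JSON
        except (requests.RequestException, ValueError):
            time.sleep(1)             # short pause before the next attempt
    raise RuntimeError("all %d attempts failed for %s" % (retries, url))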

Method 2: the response is HTML; parse it with BeautifulSoup.

r = requests.get(url, headers=headers).content
content = bs(r, "html.parser", from_encoding="utf-8")
# Locate the quote table, then read the price spans out of specific cells.
text = content.find("table", attrs={"class": "quote-info"})
tds = text.find_all("td")
p1 = str(tds[6].find("span", class_="stock-fall").text)
p2 = str(tds[2].find("span", class_="stock-rise").text)
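If any of those tags is missing (after a site redesign, for example), `find` returns `None` and the attribute access raises. A minimal defensive sketch wrapping the same extraction in a function; the name `get_prices` and the timeout are assumptions:

def get_prices(url):
    # Fetch a quote page and return the (fall, rise) price strings.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = bs(resp.content, "html.parser", from_encoding="utf-8")
    table = soup.find("table", attrs={"class": "quote-info"})
    if table is None:
        raise ValueError("quote table not found; the page layout may have changed")
    tds = table.find_all("td")
    p_fall = tds[6].find("span", class_="stock-fall").text
    p_rise = tds[2].find("span", class_="stock-rise").text
    return p_fall, p_rise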

Multiprocessing can also be used to crawl in parallel:

def get_close(code):
    # Fetch the intraday data for one code and return a "code,close" CSV line.
    url = "http://www.szse.cn/api/market/ssjjhq/getTimeData?marketId=1&code=%s" % code
    r = requests.get(url, headers=headers).text
    text = json.loads(r)
    px = str(text["data"]["close"])
    return code + "," + px + "\n"

syms = get_list()  # get_list() returns the list of stock codes to fetch
res = []
nProcess = 2 * mp.cpu_count() // 3  # mp.Pool() needs an int, so use floor division
if nProcess > 1:
    pool = mp.Pool(nProcess)
    res = pool.map(get_close, syms)
    pool.close()
    pool.join()
else:
    res = list(map(get_close, syms))  # map() is lazy in Python 3; materialize it
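One caveat: on Windows (and on macOS since Python 3.8), multiprocessing starts workers by spawning fresh interpreters, so the pool setup must live under an `if __name__ == "__main__":` guard or every worker re-executes the module. A minimal sketch of that structure, writing the results to a file; the output filename closes.csv is an assumption:

if __name__ == "__main__":
    syms = get_list()
    nProcess = max(1, 2 * mp.cpu_count() // 3)
    with mp.Pool(nProcess) as pool:     # the context manager closes and joins the pool
        res = pool.map(get_close, syms)
    with open("closes.csv", "w") as f:  # assumed output filename
        f.writelines(res)               # each line already ends with "\n"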
Reference: https://blog.csdn.net/qq_32784541/article/details/79655146
