【项目总结】雪球爬虫完结(附Snowball模块代码)

作为https://blog.csdn.net/CY19980216/article/details/82770410的一个完结。

暂时先把代码搬上来,注释已经写得很详细了,顺带做一个备份。

目前除了调仓记录的获取之外,其他功能都已经可以用多进程运行,速度有明显提升。目前的测试结果是:爬取1500000个组合的数据需要约200分钟,获取10000个优质组合的净值数据需要不到300分钟。

问题仍然出在需要登录才能访问的调仓数据上:由于需要登录,多进程几乎起不到加速的作用(该sleep的间隔还是要sleep)。后来我想了想,我们的项目可能并不需要一次获取大量组合的调仓数据,如果是这样的话,用一分钟获取一个组合的调仓数据也无所谓了。

有空我写个User Guide,最近确实有点忙碌了。

#-*- coding:UTF-8 -*-
import os
import re
import json
import time
import math
import urllib
import pandas
import random
import requests
import datetime

from IP import IP
from bs4 import BeautifulSoup
from DBConnector import DBConnector
from multiprocessing import Process,Manager,Lock

class Snowball():
	def __init__(self,
		userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",   
		areaCode="86",
		telephone="国内手机号码",
		initialUID="5171159182",									
	):																	 # constructor
		"""Set up constants, an HTTP session and the on-disk folder layout.

		Parameters:
			userAgent: User-Agent header sent with every request.
			areaCode: telephone area code used by the SMS login.
			telephone: phone number used by the SMS login.
			initialUID: user ID the user crawl starts from.

		Side effects (all happen at construction time): builds a proxy pool
		through the project-local IP class, issues a GET to the main site,
		and creates the User/History/Net/Portfolio folders in the current
		working directory when they are missing.
		"""
		print("Snowball类初始化...")
		""" 可能的传入参数设为类成员变量 """
		self.userAgent = userAgent
		self.areaCode = areaCode
		self.telephone = telephone
		self.initialUID = initialUID									 # crawl starts from the official "玩赚组合" account (about 400 followees, 15M followers)
		""" 基本不会变动的默认的常数设为类成员变量 """
		self.userFolder = "User"										 # folder holding user data
		self.historyFolder = "History"									 # folder holding rebalancing history
		self.netFolder = "Net"											 # folder holding net-value data
		self.portfolioFolder = "Portfolio"								 # folder holding portfolio data
		self.UIDLog = "UIDs.txt"										 # list of user IDs already collected
		self.goodUserLog = "goodUID.txt"								 # UIDs and follower counts of influential users
		self.userLog = "Users.txt"										 # collected user records
		self.errorLog = "Errors.txt"									 # errors raised while the methods run
		self.portfolioLog = "Portfolios.txt"							 # symbols of well-performing portfolios
		self.mainURL = "https://xueqiu.com"								 
		self.ipURL = "http://www.xicidaili.com"			
		self.portfolioURL = "https://xueqiu.com/P/{}"			 
		self.loginURL = "https://xueqiu.com/snowman/login"		
		self.userURL = "https://xueqiu.com/cubes/list.json?user_id={}&_={}"
		self.netURL = "https://xueqiu.com/cubes/nav_daily/all.json?cube_symbol={}"
		self.queryURL = "https://xueqiu.com/cube/search.json?q={}&count={}&page={}"	
		self.codeURL = "https://xueqiu.com/account/sms/send_verification_code.json"
		self.followerURL = "https://xueqiu.com/friendships/followers.json?uid={}&pageNo={}"
		self.followeeURL = "https://xueqiu.com/friendships/groups/members.json?uid={}&page={}&gid=0"
		self.historyURL = "https://xueqiu.com/cubes/rebalancing/history.json?cube_symbol={}&count={}&page={}"
		# NOTE(review): the "." between octets is unescaped, so it matches any character.
		self.ipRegulation = r"(([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5]).){3}([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])"
		# NOTE(review): hard-coded browser cookie containing session tokens; consider loading it from configuration instead.
		self.cookie = "AQAAAKTCMDO5LQkA/wFeZQIWGR34f/iG; xq_a_token=6125633fe86dec75d9edcd37ac089d8aed148b9e; xq_a_token.sig=CKaeIxP0OqcHQf2b4XOfUg-gXv0; xq_r_token=335505f8d6608a9d9fa932c981d547ad9336e2b5; xq_r_token.sig=i9gZwKtoEEpsL9Ck0G7yUGU42LY; u=471544938460796; Hm_lvt_1db88642e346389874251b5a1eded6e3=1544938461; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1544938461; device_id=8811e70b46b0adaa9496184d828c6f1d; _ga=GA1.2.1956277879.1544938463; _gid=GA1.2.679564619.1544938463; _gat_gtag_UA_16079156_4=1"
		
		self.historyRecord = {											 # column template for the rebalancing-history csv
			"PortCode":[],												 # portfolio symbol
			"Updated":[],												 # update time
			"SecuCode":[],												 # stock code
			"StockName":[],												 # stock name
			"PrevWeight":[],											 # weight before the rebalancing
			"TargetWeight":[],											 # weight after the rebalancing
		}
		self.netRecord = {												 # column template for the net-value csv
			"PortCode":[],												 # portfolio symbol
			"NavDate":[],												 # net-value date
			"Nav":[],													 # net value
		}
		self.selectAttributes = {										 # user fields kept by the crawl (values are the intended database column types)
			"id":"varchar(10)",
			"friends_count":"int(32)",
			"followers_count":"int(32)",
			"gender":"varchar(5)",
			"cube_count":"int(16)",
			"stocks_count":"int(16)",
		}
		self.portfolioRecord = {										 # portfolio fields kept from the search results, in output order
			"symbol":[],
			"market":[],
			"daily_gain":[],
			"monthly_gain":[],
			"annualized_gain_rate":[],
			"net_value":[],
			"owner_id":[],
		}
		""" 必要的变量初始化定义与操作 """
		self.workSpace = os.getcwd()									 # remember the working directory (os may change the current path later)
		self.IPs = IP(headers=userAgent).get_wn_IP()					 # fetch a fresh proxy pool
		self.session = requests.Session()								 # session shared by the methods of this instance
		self.myHeaders = {
			"User-Agent":self.userAgent,
		}
		self.session.headers = self.myHeaders							 # the session and self.myHeaders now share one dict object
		self.session.get(self.mainURL)									 # visit the main page once before any API call
		self.lock = Lock()												 # guards appends to the shared log files
		if not os.path.exists(self.userFolder):							 # create the user-data folder
			print("新建文件夹以存储调仓与用户有关的数据...")
			os.mkdir("{}\\{}".format(os.path.abspath(os.curdir),self.userFolder))
			print("正在新建文本以存储已经爬取到的用户ID...")
			# The UID log is seeded with the initial UID only when the folder is first created.
			with open("{}\\{}".format(self.userFolder,self.UIDLog),"a") as f:
				f.write("{}\n".format(self.initialUID))
		if not os.path.exists(self.historyFolder):						 # create the rebalancing-history folder
			print("正在新建文件夹以存储调仓历史数据...")
			os.mkdir("{}\\{}".format(os.path.abspath(os.curdir),self.historyFolder))
		if not os.path.exists(self.netFolder):							 # create the net-value folder
			print("正在新建文件夹以存储组合净值数据...")
			os.mkdir("{}\\{}".format(os.path.abspath(os.curdir),self.netFolder))
		if not os.path.exists(self.portfolioFolder):					 # create the portfolio-data folder
			# NOTE(review): this message repeats the net-value text although the folder holds portfolio data.
			print("正在新建文件夹以存储组合净值数据...")
			os.mkdir("{}\\{}".format(os.path.abspath(os.curdir),self.portfolioFolder))

	def login(self):
		"""Log in with an SMS verification code typed in by the operator.

		The stored cookie is attached for the login requests only; once the
		login succeeds the headers are reset to the bare User-Agent, because
		the rebalancing endpoints reject requests that still carry it.

		A response whose third character is "e" (a body starting with
		'{"e...') is treated as a failure. A rejected login asks for a new
		code instead of re-posting the same one forever.
		"""
		self.session.headers["Cookie"] = self.cookie
		codeData = {													 # form posted to request the SMS code
			"areacode":self.areaCode,
			"telephone":self.telephone,
		}
		formData = {													 # form posted to log in
			"areacode":self.areaCode,
			"telephone":self.telephone,
			"remember_me":"true",
		}
		prompt = "请输入手机号为{}的验证码:".format(self.telephone)
		r = self.session.post(self.codeURL,codeData)
		while r.text[2:3]=="e":											 # slicing avoids IndexError on a short body
			print("获取短信验证码失败!\t{}".format(r.text))
			input("继续获取验证码?")
			r = self.session.post(self.codeURL,codeData)
		formData["code"] = input(prompt)
		r = self.session.post(self.loginURL,formData)
		while r.text[2:3]=="e":
			# Report the response that actually failed, then ask for a fresh code.
			print("短信验证码登录失败!\t{}".format(r.text))
			formData["code"] = input(prompt)
			r = self.session.post(self.loginURL,formData)
		print("登录成功!")
		self.session.headers = {"User-Agent":self.userAgent}			 # drop the cookie again

	def parse_follower(self,UID,UIDs,
		repeatCheck=True,
		threshold=1000
	):																	 # 获取指定ID的用户的粉丝列表用户信息
		try:followerHTML = self.session.get(self.followerURL.format(UID,1)).text
		except:
			print("无法访问ID为{}的用户第1页粉丝列表!".format(UID))
			with open(self.errorLog,"a") as f:
				f.write("无法访问ID为{}的用户第1页粉丝列表!\t{}\n".format(UID,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			return False
		if not followerHTML[2]=="c":									 # 排除访问出错
			print("在爬取ID为{}的用户第1页粉丝列表时发生错误!".format(UID))
			with open(self.errorLog,"a") as f:
				f.write("在爬取ID为{}的用户第1页粉丝列表时发生错误!\t{}\n".format(UID,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			return False							 
		followerJson = json.loads(followerHTML)							 # 粉丝页面Json
		followerPage = followerJson["maxPage"]							 # 粉丝页面数量
		followerList = followerJson["followers"]						 # 20个粉丝Json构成的列表
		print("ID为{}的用户粉丝列表含有{}页非匿名粉丝".format(UID,followerPage))
		print("正在获取ID为{}用户的第1页的信息...".format(UID))
		for follower in followerList:
			if repeatCheck:
				if follower["id"] in UIDs:continue
			if follower["followers_count"]>=threshold:
				flag = False
				content = ""
				for key in self.selectAttributes.keys():
					if flag:content += ","
					else:flag = True
					content += str(follower[key])
				""" 记录用户数据框 """
				self.lock.acquire()
				with open("{}\\{}".format(self.userFolder,self.userLog),"a") as f:
					f.write("{}\n".format(content))
				self.lock.release()
			""" 记录已经爬取过的用户 """
			UIDs.append(follower["id"])
			with open("{}\\{}".format(self.userFolder,self.UIDLog),"a") as g:
				g.write("{}\n".format(follower["id"]))
		if followerPage>1:												 # 不超过一页就在循环外处理完了
			for i in range(2,followerPage+1):
				print("正在获取ID为{}用户的第{}页的信息...".format(UID,i))
				try:followerHTML = self.session.get(self.followerURL.format(UID,i)).text
				except:
					print("无法访问ID为{}的用户第1页粉丝列表!".format(UID))
					with open(self.errorLog,"a") as f:
						f.write("无法访问ID为{}的用户第{}页粉丝列表!\t{}\n".format(UID,i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					return False
				if not followerHTML[2]=="c":							 # 排除访问出错
					print("在爬取ID为{}的用户第{}页粉丝列表时发生错误!".format(UID,i))
					with open(self.errorLog,"a") as f:
						f.write("在爬取ID为{}的用户第{}页粉丝列表时发生错误!\t{}\n".format(UID,i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					return False
				followerJson = json.loads(followerHTML)					 # 粉丝页面Json
				followerPage = followerJson["maxPage"]					 # 粉丝页面数量
				followerList = followerJson["followers"]				 # 20个粉丝Json构成的列表				
				for follower in followerList:
					if repeatCheck:
						if follower["id"] in UIDs:continue
					if follower["followers_count"]>=threshold:
						flag = False
						content = ""
						for key in self.selectAttributes.keys():
							if flag:content += ","
							else:flag = True
							content += str(follower[key])
						""" 记录用户数据框 """
						self.lock.acquire()
						with open("{}\\{}".format(self.userFolder,self.userLog),"a") as f:
							f.write("{}\n".format(content))
						self.lock.release()
					""" 记录已经爬取过的用户 """
					UIDs.append(follower["id"])
					with open("{}\\{}".format(self.userFolder,self.UIDLog),"a") as g:
						g.write("{}\n".format(follower["id"]))	
		return True

	def parse_followee(self,UID,UIDs,
		repeatCheck=True,
		threshold=1000
	):																	 # 获取指定ID的用户的关注列表用户信息
		try:followeeHTML = self.session.get(self.followeeURL.format(UID,1)).text
		except:
			print("无法访问ID为{}的用户第1页关注列表!".format(UID))
			with open(self.errorLog,"a") as f:
				f.write("无法访问ID为{}的用户第1页关注列表!\t{}\n".format(UID,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			return False
		if not followeeHTML[2]=="c":
			print("在爬取ID为{}的用户第1页关注列表时发生错误!".format(UID))
			with open(self.errorLog,"a") as f:
				f.write("在爬取ID为{}的用户第1页关注列表时发生错误!\t{}\n".format(UID,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			return False
		followeeJson = json.loads(followeeHTML)							 # 粉丝页面Json
		followeePage = followeeJson["maxPage"]							 # 粉丝页面数量
		followeeList = followeeJson["users"]							 # 20个粉丝Json构成的列表
		print("ID为{}的用户关注列表含有{}页用户".format(UID,followeePage))
		print("正在获取ID为{}用户的第1页的信息...".format(UID))
		for followee in followeeList:
			if repeatCheck:
				if followee["id"] in UIDs:continue
			if followee["followers_count"]>=threshold: 
				flag = False
				content = ""
				for key in self.selectAttributes.keys():
					if flag:content += ","
					else:flag = True
					content += str(followee[key])
				""" 记录用户数据框 """
				self.lock.acquire()
				with open("{}\\{}".format(self.userFolder,self.userLog),"a") as f:
					f.write("{}\n".format(content))
				self.lock.release()
			""" 记录已经爬取过的用户 """
			UIDs.append(followee["id"])
			self.lock.acquire()
			with open("{}\\{}".format(self.userFolder,self.UIDLog),"a") as g:
				g.write("{}\n".format(followee["id"]))
			self.lock.release()
		
		if followeePage>1:												 # 不超过一页就在循环外处理完了
			for i in range(2,followeePage+1):
				print("正在获取ID为{}用户的第{}页的信息...".format(UID,i))
				try:followeeHTML = self.session.get(self.followeeURL.format(UID,i)).text
				except:
					print("无法访问ID为{}的用户第{}页关注列表!".format(UID,i))
					with open(self.errorLog,"a") as f:
						f.write("无法访问ID为{}的用户第{}页关注列表!\t{}\n".format(UID,i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					return False
				if not followeeHTML[2]=="c":							 # 排除访问出错
					print("在爬取ID为{}的用户第{}页关注列表时发生错误!".format(UID,i))
					with open(self.errorLog,"a") as f:
						f.write("在爬取ID为{}的用户第{}页关注列表时发生错误!\t{}\n".format(UID,i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					return False
				followeeJson = json.loads(followeeHTML)					 # 粉丝页面Json
				followeePage = followeeJson["maxPage"]					 # 粉丝页面数量
				followeeList = followeeJson["users"]					 # 20个粉丝Json构成的列表				
				for followee in followeeList:
					if repeatCheck:
						if followee["id"] in UIDs:continue
					if followee["followers_count"]>=threshold: 
						flag = False
						content = ""
						for key in self.selectAttributes.keys():
							if flag:content += ","
							else:flag = True
							content += str(followee[key])
						""" 记录用户数据框 """
						self.lock.acquire()
						with open("{}\\{}".format(self.userFolder,self.userLog),"a") as f:
							f.write("{}\n".format(content))
						self.lock.release()
					""" 记录已经爬取过的用户 """
					UIDs.append(followee["id"])
					self.lock.acquire()
					with open("{}\\{}".format(self.userFolder,self.UIDLog),"a") as g:
						g.write("{}\n".format(followee["id"]))
					self.lock.release()
		return True

	def parse_first_follower(self,n,nJobs,maxPage,
		resetTime=600,
		threshold=1000
	):																	 # 获取第一个ID的用户的粉丝列表信息(因为一般选择粉丝数多的用户作为起始点,因此可以节约检验是否重复爬取的时间)
		count = 0
		page = n
		while page<=maxPage:
			try:followerHTML = self.session.get(self.followerURL.format(self.initialUID,page)).text
			except:
				print("无法访问ID为{}的用户第{}页粉丝列表!".format(self.initialUID,page))
				with open(self.errorLog,"a") as f:
					f.write("无法访问ID为{}的用户第{}页粉丝列表!\t{}\n".format(self.initialUID,page,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
				time.sleep(resetTime)
				continue
			if not followerHTML[2]=="c":								 # 排除访问出错
				print("在爬取ID为{}的用户第{}页粉丝列表时发生错误!".format(self.initialUID,page))
				with open(self.errorLog,"a") as f:
					f.write("在爬取ID为{}的用户第{}页粉丝列表时发生错误!\t{}\n".format(self.initialUID,page,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
				time.sleep(resetTime)
				continue
			followerJson = json.loads(followerHTML)						 # 粉丝页面Json
			followerList = followerJson["followers"]					 # 20个粉丝Json构成的列表
			print("正在获取ID为{}用户的第{}页的信息...".format(self.initialUID,page))
			for follower in followerList:
				if follower["followers_count"]>=threshold: 
					content = ""
					flag = False
					for key in self.selectAttributes.keys():
						if flag:content += ","
						else:flag = True
						content += str(follower[key])
					""" 记录用户数据框 """
					self.lock.acquire()
					with open("{}\\{}".format(self.userFolder,self.userLog),"a") as f:
						f.write("{}\n".format(content))
					self.lock.release()
				""" 记录已经爬取过的用户 """
				self.lock.acquire()
				with open("{}\\{}".format(self.userFolder,self.UIDLog),"a") as g:
					g.write("{}\n".format(follower["id"]))	
				self.lock.release()	
			count += 1		
			page = nJobs*count+n

	def parse_follower_and_followee(self,n,UIDs,nJobs,
		repeatCheck=True,
		resetTime=600
	):																	 # 多进程爬取用户信息的目标函数
		count = -1
		while True:
			count += 1
			flag1 = self.parse_follower(UIDs[n+nJobs*count],UIDs,repeatCheck)
			while not flag1:
				time.sleep(resetTime)
				flag1 = self.parse_follower(UIDs[n+nJobs*count],UIDs,repeatCheck)
			flag2 = self.parse_followee(UIDs[n+nJobs*count],UIDs,repeatCheck)	
			while not flag2:
				time.sleep(resetTime)
				flag2 = self.parse_followee(UIDs[n+nJobs*count],UIDs,repeatCheck)

	def parse_portfolio_data(self,symbols,ID):							 # 利用搜索引擎直接遍历所有组合
		savePath = "{}\\{}".format(self.portfolioFolder,datetime.datetime.now().strftime("%Y-%m-%d"))
		for symbol in symbols:
			print("正在处理编号为{}的组合\t{}".format(symbol,ID))
			while True:													 # while循环用于获得正确的HTML以及预先去除不存在的组合
				try:html = self.session.get(self.queryURL.format(symbol,10,1)).text
				except:
					print("无法访问编号为{}的组合!".format(symbol))
					self.lock.acquire()
					with open(self.errorLog,"a") as f:
						f.write("无法访问编号为{}的组合!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					self.lock.release()
					time.sleep(resetTime)
					continue
				try:string = html[2]
				except:
					print("访问编号为{}的组合返回HTML无信息!".format(symbol))
					self.lock.acquire()
					with open(self.errorLog,"a") as f:
						f.write("访问编号为{}的组合返回HTML无信息!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					self.lock.release()
					time.sleep(resetTime)
					continue					
				if not html[2]=="c":
					print("访问编号为{}的组合时搜索引擎频繁访问!".format(symbol))
					self.lock.acquire()
					with open(self.errorLog,"a") as f:
						f.write("访问编号为{}的组合时搜索引擎频繁访问!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					self.lock.release()
					time.sleep(resetTime)
					continue
				break	
			queryJson = json.loads(html)
			totalCount = queryJson["totalCount"]
			if totalCount==0:
				print("编号为{}的组合不存在\t{}".format(symbol,ID))
				self.lock.acquire()
				with open("{}_Non.txt".format(savePath),"a") as f:
					f.write("{}\n".format(symbol))
				self.lock.release()
				continue
			maxPage = int(queryJson["maxPage"])
			portfolios = queryJson["list"]
			flag = True
			for portfolio in portfolios:
				if portfolio["symbol"]==symbol:
					flag = False
					content = ""
					count = -1
					for key in self.portfolioRecord.keys():
						count += 1
						if count:content += ",{}".format(portfolio[key])
						else:content += "{}".format(portfolio[key])
					self.lock.acquire()
					with open("{}_P.txt".format(savePath),"a") as f:
						f.write("{}\n".format(content)) 
					self.lock.release()

	def parse_portfolio_history(self,symbol,
		maxCount=50,
		maxPage=50,
		interval=8,
	):																	 # 获取指定编号的组合调仓历史数据并转为csv文件(maxCount为一页上最多能放的调仓数据条数,maxPage为最多能查询到的数据页数)
		print("正在爬取编号为{}的组合调仓历史数据...".format(symbol))
		html = self.session.get(self.historyURL.format(symbol,1,1)).text # 获取最新的一条调仓数据,观察是否访问出错
		print(html[:100])
		if not html[2]=="c":											 # 访问正常时的HTML开头为“{"count:1,"page:...”
			print("在预备爬取编号为{}的组合调仓历史数据时发生错误!".format(symbol))
			with open(self.errorLog,"a") as f:
				f.write("在预备爬取编号为{}的组合调仓历史数据时发生错误!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			return False									
		tempJson = json.loads(html)
		total = tempJson["totalCount"]									 # 获取调仓总次数
		page = min(math.ceil(total/maxCount),maxPage)					 # 计算总页面数量(超过maxPage以maxPage计算,无论一页放多少条调仓记录都最多50页,一页调仓记录上限是50条)
		print("一共有{}页调仓数据,每页{}条调仓数据".format(page,maxCount))
		historyRecord = self.historyRecord.copy()
		for i in range(1,page+1):
			print("正在获取第{}页的信息...".format(i))
			html = self.session.get(self.historyURL.format(symbol,maxCount,i)).text
			if html[2]=="e":
				print("在爬取编号为{}的组合调仓历史数据中途发生错误!页码数{}".format(symbol,i))
				with open(self.errorLog,"a") as f:
					f.write("在爬取编号为{}的组合调仓历史数据中途发生错误!页码数{}\t{}\n".format(symbol,i,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
				return False											 # 表明在爬取一只组合的调仓数据中途出错
			historyJson = json.loads(html)								 # 页面HTML转字典
			historyList = historyJson["list"]							 # 字典"list"键保存50条调仓记录值
			for history in historyList:									 # 遍历50次调仓记录
				if history["category"]=="user_rebalancing":				 # 需要用户调仓而非系统调仓(sys_rebalancing通常为股息分红)
					for detail in history["rebalancing_histories"]:		 # 在一次调仓记录中有若干只股票的调整(都是同一时间更新的,保存在rebalancing_histories键下)
						historyRecord["PortCode"].append(symbol)
						historyRecord["Updated"].append(time.strftime("%Y-%m-%d %H:%I:%S",time.localtime(detail["updated_at"]/1000)))
						historyRecord["SecuCode"].append(detail["stock_symbol"])
						historyRecord["StockName"].append(detail["stock_name"])
						historyRecord["PrevWeight"].append(detail["prev_weight_adjusted"] if detail["prev_weight_adjusted"] else 0.0)
						historyRecord["TargetWeight"].append(detail["target_weight"])
			time.sleep(random.uniform(0,interval))
		pandas.DataFrame(historyRecord).to_csv("{}\\{}.csv".format(self.historyFolder,symbol),index=False,header=False)
		return True

	def parse_portfolio_net(self,symbol,errorLog=False):				 # 获取指定编号的组合净值数据并转为csv文件
		html = self.session.get(self.netURL.format(symbol)).text
		if not html[3]=="s":											 # 访问正常时的HTML开头为“[{"symbol":...”
			print("在爬取编号为{}的组合净值数据时发生错误!".format(symbol))
			if errorLog:
				with open(self.errorLog,"a") as f:
					f.write("在爬取编号为{}的组合净值数据时发生错误!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
			return False
		netList = json.loads(html)[0]["list"]
		netRecord = self.netRecord.copy()
		for netJson in netList:
			netRecord["PortCode"].append(symbol)
			netRecord["NavDate"].append(netJson["date"])
			netRecord["Nav"].append(netJson["value"])
		pandas.DataFrame(netRecord).to_csv("{}\\{}.csv".format(self.netFolder,symbol),index=False,header=False)
		return True

	def parse_portfolio_history_multiprocess(self,symbols,ID):
		for symbol in symbols:
			print("正在获取编号为{}的组合净值与调仓数据\t{}".format(symbol,ID))
			flag = self.parse_portfolio_history(symbol)
			while not flag:
				flag = self.parse_portfolio_history(symbol)		

	def parse_portfolio_net_multiprocess(self,symbols,ID,
		interval=10,
	):																	 # 多进程获取组合净值数据(我发现净值访问出错可以通过暴力连续访问突破错误)
		for symbol in symbols:
			print("正在获取编号为{}的组合净值与调仓数据\t{}".format(symbol,ID))
			flag = self.parse_portfolio_net(symbol)
			while not flag:
				flag = self.parse_portfolio_net(symbol)
			time.sleep(random.uniform(0,interval))

	def get_user_data(self,
		nJobs=10
	):
		"""Crawl user data with nJobs worker processes.

		The UID log is loaded into a manager-backed shared list. When it
		holds exactly one UID (the very first run) the workers split the
		follower pages of the initial user between them; otherwise each
		worker crawls followers and followees of its share of known UIDs.
		"""
		UIDs = Manager().list()
		with open("{}\\{}".format(self.userFolder,self.UIDLog),"r") as f:
			known = f.read().split("\n")[:-1]							 # the piece after the final newline is dropped
		for UID in known:
			UIDs.append(UID)
		if len(UIDs)==1:												 # first run: only the initial UID is known
			followerHTML = self.session.get(self.followerURL.format(self.initialUID,1)).text
			maxPage = json.loads(followerHTML)["maxPage"]
			jobs = [(self.parse_first_follower,(i,nJobs,maxPage,)) for i in range(1,nJobs+1)]
		else:															 # later runs: continue from the known UIDs
			jobs = [(self.parse_follower_and_followee,(i,UIDs,nJobs,True,)) for i in range(nJobs)]
		workers = []
		for target,args in jobs:
			worker = Process(target=target,args=args)
			worker.start()
			workers.append(worker)
		for worker in workers:
			worker.join()
	
	def get_portfolio_data(self,
		nJobs=15,
		maxSymbol=1500000,
		minSymbol=0,
	):
		"""Look up every symbol in [minSymbol, maxSymbol) with worker processes.

		Symbols are "ZH" followed by the padded number. They are split into
		contiguous chunks of ceil(count/nJobs) symbols, so the tail is no
		longer dropped when the count is not a multiple of nJobs; fewer than
		nJobs workers are started when there is not enough work for all.
		"""
		symbols = ["ZH{}".format(self.transfer_symbol(i)) for i in range(minSymbol,maxSymbol)]
		chunkSize = -(-len(symbols)//nJobs)								 # ceiling division
		processes = []
		for i in range(nJobs):
			chunk = symbols[chunkSize*i:chunkSize*(i+1)]
			if not chunk:break											 # nothing left for the remaining workers
			process = Process(target=self.parse_portfolio_data,args=(chunk,i+1,))
			process.start()
			processes.append(process)
		for process in processes:
			process.join()

	def select_good_portfolio(self,dataPath,
		market="cn",
		netThreshold=1.10,
		annualThreshold=5,
		monthThreshold=1,
		dayThreshold=0,
		marketFlag=True,
		netFlag=True,
		annualFlag=True,
		monthFlag=True,
		dayFlag=True,
		resetTime=600,
		sep=",",
		header=None
	):																	 # 生成符合查询条件的组合编号文本并返回组合数据框
		data = pandas.read_table(dataPath,sep=sep,header=header,low_memory=False)
		data.columns = [key for key in self.portfolioRecord.keys()]
		data[data["monthly_gain"]=="None"] = "0.0"
		data[data["daily_gain"]=="None"] = "0.0"
		data[data["net_value"]=="None"] = "0.0"
		data[data["annualized_gain_rate"]=="None"] = "0.0"
		data["monthly_gain"] = pandas.to_numeric(data["monthly_gain"])
		data["daily_gain"] = pandas.to_numeric(data["daily_gain"])
		data["net_value"] = pandas.to_numeric(data["net_value"])
		data["annualized_gain_rate"] = pandas.to_numeric(data["annualized_gain_rate"])
		flag = True
		if marketFlag:flag&=(data["market"]==market) 
		if netFlag:flag&=(data["net_value"]>=netThreshold) 
		if annualFlag:flag&=(data["annualized_gain_rate"]>=annualThreshold) 
		if monthFlag:flag&=(data["monthly_gain"]>=monthThreshold) 
		if dayFlag:flag&=(data["daily_gain"]>=dayThreshold) 
		result = data[flag]
		with open(self.portfolioLog,"w") as f:
			for symbol in result["symbol"]:
				f.write("{}\n".format(symbol))
		return result
			
	def get_portfolio_history(self):
		with open(self.portfolioLog,"r") as f:
			symbols = f.read().split("\n")[:-1]
		for symbol in symbols:
			flag = self.parse_portfolio_history(symbol)
			while not flag:
				flag = self.parse_portfolio_history(symbol)
			
	def get_portfolio_history_multiprocess(self,
		nJobs=10,
	):
		"""Fetch the rebalancing history of all logged symbols in parallel.

		The symbols in self.portfolioLog are split into contiguous chunks of
		ceil(count/nJobs) symbols, one worker process per chunk. Before, the
		floor-sized chunks dropped the tail of the list, and nothing at all
		was processed when there were fewer symbols than workers.
		"""
		with open(self.portfolioLog,"r") as f:
			symbols = f.read().split("\n")[:-1]
		chunkSize = -(-len(symbols)//nJobs)								 # ceiling division
		processes = []
		for i in range(nJobs):
			chunk = symbols[chunkSize*i:chunkSize*(i+1)]
			if not chunk:break											 # nothing left for the remaining workers
			process = Process(target=self.parse_portfolio_history_multiprocess,args=(chunk,i+1,))
			process.start()
			processes.append(process)
		for process in processes:
			process.join()
	
	def get_portfolio_net(self,
		nJobs=16,
	):
		"""Fetch the net-value series of all logged symbols in parallel.

		The symbols in self.portfolioLog are split into contiguous chunks of
		ceil(count/nJobs) symbols, one worker process per chunk. Before, the
		floor-sized chunks dropped the tail of the list, and nothing at all
		was processed when there were fewer symbols than workers.
		"""
		with open(self.portfolioLog,"r") as f:
			symbols = f.read().split("\n")[:-1]
		chunkSize = -(-len(symbols)//nJobs)								 # ceiling division
		processes = []
		for i in range(nJobs):
			chunk = symbols[chunkSize*i:chunkSize*(i+1)]
			if not chunk:break											 # nothing left for the remaining workers
			process = Process(target=self.parse_portfolio_net_multiprocess,args=(chunk,i+1,))
			process.start()
			processes.append(process)
		for process in processes:
			process.join()

	def update_portfolio_history(self,
		resetTime=600,
		maxCount=50,
	): 																	 # 更新类创建目录(调仓记录)下所有组合的信息(csv中记录的字段顺序是:"组合编号","当前权重","股票代码","股票名称","目标权重","更新时间")
		for root,dirs,files in os.walk("{}\\{}".format(self.workSpace,self.historyLog)):
			for fileName in files:
				print("正在更新调仓历史文件{}...".format(fileName))
				data = pandas.read_csv(fileName,header=None,nrows=1,sep=",")
				lastTime = datetime.datetime.strptime(data.loc[0,5],"%Y-%m-%d %H:%I:%S")
				index = fileName.find(".")
				symbol = fileName[:index]								 # 获取组合编号
				page = 0
				content = ""
				while True:												 # 一页一页地遍历确定最新的调仓
					page += 1
					historyHTML = self.session(self.historyURL.format(symbol,maxCount,page))
					while not historyHTML[2]=="c":						 # 排除访问出错
						print("更新编号为{}的组合调仓记录时发生错误!".format(symbol))
						with open(self.errorLog,"a") as f:
							f.write("更新编号为{}的组合调仓记录时发生错误!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
						time.sleep(resetTime)
						historyHTML = self.session(self.historyURL.format(symbol,maxCount,page))
					historyJson = json.loads(historyHTML)
					historyList = historyJson["list"]
					flag = False
					for history in historyList:
						timeString = time.strftime("%Y-%m-%d %H:%I:%S",time.localtime(history["updated_at"]/1000))
						updateTime = datetime.datetime.strptime(timeString,"%Y-%m-%d %H:%I:%S")
						if updateTime<=lastTime:						 # 若更新时间不大于目前的最新时间
							flag = True
							break					 
						else:
							content += "{},{},{},{},{},{}\n".format(symbol,history["prev_weight_adjusted"] if detail["prev_weight_adjusted"] else 0.0,history["stock_symbol"],history["stock_name"],history["target_weight"],timeString)
					if flag:break
				if content:												 # 如果没有更新则不会去修改
					with open(fileName,"r+") as f:
						origin = f.read()
						f.seek(0,0)
						f.write("{}{}".format(content,origin))
	
	def update_portfolio_net(self,
		resetTime=600
	):																	 # 更新类创建目录(净值)下所有组合的信息(csv中记录的字段顺序是:"净值","更新时间","组合编号")
		for root,dirs,files in os.walk("{}\\{}".format(self.workSpace,self.netLog)):
			for fileName in files:
				print("正在更新净值文件{}...".format(fileName))
				with open(fileName,"r") as f:
					lastLine = f.read().split("\n")[-2]					 # 与调仓历史数据时间逆序排列不同,净值数据是按照时间顺序排列的
				lastTime = datetime.datetime.strptime(lastLine.split(",")[1],"%Y-%m-%d")
				index = fileName.find(".")
				symbol = fileName[:index]								 # 获取组合编号
				page = 0
				content = ""
				html = self.session.get(self.netURL.format(symbol)).text
				while not html[3]=="s":									 # 排除访问出错
					print("更新编号为{}的组合净值时发生错误!".format(symbol))
					with open(self.errorLog,"a") as f:
						f.write("更新编号为{}的组合净值时发生错误!\t{}\n".format(symbol,datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
					time.sleep(resetTime)
					html = self.session.get(self.netURL.format(symbol)).text
				netList = json.loads(html)[0]["list"]
				for netJson in netList:
					updateTime = datetime.datetime.strptime(netJson["date"],"%Y-%m-%d")
					if updateTime<=lastTime:
						break
					else:
						content += "{},{},{}\n".format(netJson["value"],netJson["data"],symbol)
				if content:
					with open(fileName,"a") as f:
						f.write(content)
	
	def merge_csv(self,csvPath,objectName):								 # 用于合并csv文件(在命令行中使用批处理代码)
		os.system("type {}\\*.csv > {}".format(csvPath,objectName))      # 这里有一个天坑:文件路径与文件名称无论如何不能含有空格,否则批处理代码语法错误

	def merge_txt(self,txtPath,objectName):
		os.system("type {}\\*.txt > {}".format(txtPath,objectName))

	def transfer_symbol(self,n):										 # 将自然数转为编号的小函数
		if n<10: return "00000{}".format(n)
		if n<100: return "0000{}".format(n)
		if n<1000: return "000{}".format(n)
		if n<10000: return "00{}".format(n)
		if n<100000: return "0{}".format(n)
		return "{}".format(n)

	def refresh_IPs(self,resetTime=300):
		"""Replace the proxy pool with a freshly fetched one (currently unused).

		The pool is built the same way as in the constructor. The previous
		version referenced the bare name userAgent, which does not exist in
		this scope and raised NameError. resetTime is unused; it is kept for
		interface compatibility.
		"""
		self.IPs = IP(headers=self.userAgent).get_wn_IP()
	
	def refresh_session(self):											 # 更新类成员变量session的代理(暂时无用)
		while True:
			self.session.proxies = {"https":"https://{}".format(self.IPs[random.randint(0,len(self.IPs)-1)])}
			try:
				self.session.get(self.mainURL)
				print("重新选取代理IP成功!")
				break
			except: print("选取的代理IP不可用,正在重新选择...")

	def new_session(self):
		"""Return a new requests session carrying only the User-Agent header (currently unused).

		No proxy is configured on the returned session.
		"""
		headers = {"User-Agent":self.userAgent}
		fresh = requests.Session()
		fresh.headers = headers
		return fresh

if __name__ == "__main__":
	# Manual smoke test: log in, then fetch the rebalancing history of every
	# symbol listed in the portfolio log. The other entry points that can be
	# called here are get_portfolio_net(), get_portfolio_data() and
	# get_portfolio_history_multiprocess().
	print("测试开始...")
	crawler = Snowball(telephone="国内手机号码")
	crawler.login()
	crawler.get_portfolio_history()

Snowball类调用了IP类,IP类的代码详见https://blog.csdn.net/CY19980216/article/details/84883365

(未完待续)


2018.12.19更新

最近发现调仓数据出现无法爬取的问题,检测后发现短信验证码登录的返回结果是“登录访问过于频繁”(当然我一天只登一次也会报错)。经过测试找到了解决方法——先用浏览器登录一次,复制下登录时的cookie(我找的是"AQAAAKTCMDO5LQkA/wFeZQIWGR34f/iG; xq_a_token=6125633fe86dec75d9edcd37ac089d8aed148b9e; xq_a_token.sig=CKaeIxP0OqcHQf2b4XOfUg-gXv0; xq_r_token=335505f8d6608a9d9fa932c981d547ad9336e2b5; xq_r_token.sig=i9gZwKtoEEpsL9Ck0G7yUGU42LY; u=471544938460796; Hm_lvt_1db88642e346389874251b5a1eded6e3=1544938461; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1544938461; device_id=8811e70b46b0adaa9496184d828c6f1d; _ga=GA1.2.1956277879.1544938463; _gid=GA1.2.679564619.1544938463; _gat_gtag_UA_16079156_4=1")。然后在登录时将该cookie加到requests.Session()对象的headers中(键名为"Cookie")。然后就可以用很长一段时间了(反正到现在一个星期前复制的这个假cookie还能用,还不用换)。

然后发现虽然登录成功了(因为再次访问雪球网发现已经可以看到自己的用户名了),但是仍然不可以访问调仓数据。我百思不得其解,终于在今天一个巧合,我在登录成功后再把requests.Session()对象的headers中的cookie给删掉,就可以访问调仓数据了。这个在之前是从来没有发生过的问题。

目前正在测试多进程获取调仓数据(代码已修正,未完待续)

 

 

你可能感兴趣的:(股票,python,日常,项目总结,囚生的爬虫之旅)