#! /usr/bin/python2
# coding=utf-8
# Download per-day tick data from Sina for every stock listed in MongoDB and
# store each trade as one document in the day_detail.detail collection.
import urllib2       # used by the disabled get_day() fetcher kept below
import pymongo
import mechanize
from datetime import datetime
from datetime import timedelta

# Module-level handles, bound by the init_* functions below.
stocks = None        # stock_list.stocks collection
detail = None        # day_detail.detail collection
conn_list = None
conn_detail = None
list_code = []


def init_mongodb_list():
    """Connect to MongoDB and bind the stock_list.stocks collection."""
    global stocks
    global conn_list
    # mongodb_link = 'mongodb://127.0.0.1:27017'
    # mongoClient = MongoClient(mongodb_link)
    conn_list = pymongo.MongoClient("localhost", 27017)
    conn_list.stock_list.authenticate("d", "zz")
    db = conn_list.stock_list
    stocks = db.stocks


def init_mongodb_day_detail():
    """Connect to MongoDB and bind the day_detail.detail collection."""
    global detail
    global conn_detail
    conn_detail = pymongo.MongoClient("localhost", 27017)
    conn_detail.day_detail.authenticate("d", "zz")
    db = conn_detail.day_detail
    detail = db.detail


def get_newest_date(code):
    """Return (date, time) of the most recent tick stored for a code,
    or ("", "00:00:00") when nothing has been stored yet."""
    date = ""
    time1 = "00:00:00"
    l = detail.find({"code": code}).sort([('date', -1), ('time', -1)]).limit(1)
    for i in l:
        code = i.get("code")
        date = i.get("date")
        time1 = i.get("time")
        print code, date, time1
    return date, time1


# Sina: one tick row -> one upserted document. The "price_fluncuation"
# spelling is kept as-is so documents already in the collection still match.
def update_ri(code, date, time, price, price_fluncuation, volume, turnover, nature):
    print "sql", code, date, time, price, price_fluncuation, volume, turnover, nature
    detail.update({"code": code, "date": date, "time": time},
                  {"$setOnInsert": {"price": price,
                                    "price_fluncuation": price_fluncuation,
                                    "volume": volume,
                                    "turnover": turnover,
                                    "nature": nature}},
                  upsert=True)


def get_stock_list():
    """Load every (code, name, date_start) record into list_code."""
    l = stocks.find()
    for i in l:
        list_code.append({"code": i.get("code"),
                          "name": i.get("name"),
                          "date_start": i.get("date_start")})


def day_plus(day_str):
    """Parse a YYYY-MM-DD string and return the following day."""
    day = datetime.strptime(day_str, "%Y-%m-%d")
    return day + timedelta(days=1)


def day_parse(day_str):
    """Parse a YYYY-MM-DD string into a datetime. (This was a second
    definition of day_plus that silently shadowed the one above; renamed
    so both behaviours stay available.)"""
    return datetime.strptime(day_str, "%Y-%m-%d")


def day_plus1(day):
    """Return day + 1 day (datetime in, datetime out)."""
    return day + timedelta(days=1)


def day_str_change(day_str):
    """Convert YYYY-MM-DD to YYYYMMDD."""
    day = datetime.strptime(day_str, "%Y-%m-%d")
    return day.strftime('%Y%m%d')
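
# For reference, one stored tick document (as upserted by update_ri above)
# looks roughly like this -- the values are illustrative, not real data:
#
#   {"code": "300127", "date": "2016-10-28", "time": "14:59:55",
#    "price": 16.3, "price_fluncuation": -0.05, "volume": 120.0,
#    "turnover": 195600.0, "nature": "卖盘"}
#
# "nature" is the buy/sell flag decoded from Sina's GB18030 output and
# re-encoded as UTF-8 before storage.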
def stock_header(code):
    """Map a bare stock code to the Sina &symbol= URL suffix
    (Shanghai codes get an 'sh' prefix, Shenzhen codes 'sz')."""
    url = '&symbol='
    t1 = ('60', '900')
    t2 = ('000', '002', '300', '200')
    t3 = ('399001', '399006')
    if code.startswith(t1):
        symbol = 'sh' + code
    #elif code.startswith('000001'):
    #    symbol = '0' + code
    #elif code.startswith(t3):
    #    symbol = '1' + code
    elif code.startswith(t2):
        symbol = 'sz' + code
    else:
        # Unrecognised prefix; pass the code through unchanged and log it.
        symbol = code
        print code
    return url + symbol


def deal_url(str_day, url):
    """Build the full Sina tick-download URL for one day."""
    if ("" == str_day):
        print url
        return url
    # Sina columns: trade time, price, price change, volume, turnover, nature
    str_url = "http://market.finance.sina.com.cn/downxls.php?date=" + str_day + url
    return str_url


def file_to_sql(code, date, content):
    """Parse one downloaded tab-separated tick file and upsert each row."""
    rows = content.split('\n')
    rows.sort()
    i = 0
    cnt = len(rows) - 1
    for row in rows:
        # Skip the first and last rows (the empty line and the header end up
        # at the edges after sorting).
        if ((i == 0) or (i == cnt)):
            i += 1
            print "d", i
            continue
        split_row = row.split("\t")
        try:
            # Sina format: 6 tab-separated columns; '--' means zero.
            full_data = []
            for row_s in split_row:
                cell = row_s.strip().replace('--', '0')
                full_data.append(cell)
            if (6 != len(full_data)):
                continue
            # The "nature" column is GB18030-encoded Chinese text.
            str_nature = full_data[5].decode('gb18030')
            update_ri(code, date, full_data[0], float(full_data[1]),
                      float(full_data[2]), float(full_data[3]),
                      float(full_data[4]), str_nature.encode("utf8"))
            # 163 format (16 columns), kept for reference; note it was
            # written against an older update_ri signature:
            '''
            full_data = []
            for row_s in split_row:
                cell = row_s.replace("\r", '').replace('None', '0')
                if ('' == cell):
                    cell = '0'
                full_data.append(cell)
            if (16 != len(full_data)):
                break
            update_ri(code, full_data[0], float(full_data[3]), float(full_data[4]),
                      float(full_data[5]), float(full_data[6]), float(full_data[7]),
                      float(full_data[8]), float(full_data[9]), float(full_data[10]),
                      float(full_data[11]), float(full_data[12]), float(full_data[13]),
                      float(full_data[14]), float(full_data[15]))
            '''
        except ValueError:
            # Print the offending row in red and move on.
            print '\033[1;31;40m'
            print split_row
            print "--------------------ValueError----------------------------------------------"
            print '\033[0m'
            continue
        i += 1


'''
def get_day(code, url):
    # Disabled urllib2-based fetcher, kept for reference.
    print code, url
    if ("" == url):
        print "---newst---date---------------------------------"
        return
    # url = 'http://quotes.money.163.com/service/chddata.html?code=1000002'
    # url = 'http://quotes.money.163.com/service/chddata.html?code=0601398&start=20000720&end=20150508'
    # url = 'http://table.finance.yahoo.com/table.csv?s=000002.sz'
    # url = 'http://table.finance.yahoo.com/table.csv?s=000002.sz&d=6&e=22&f=2006&g=d&a=11&b=16&c=1991&ignore=.csv'
    # url = 'http://hq.sinajs.cn/?list=sh600127'
    # http://market.finance.sina.com.cn/downxls.php?date=2016-10-28&symbol=sz300127
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip',
        'Connection': 'close',
        'Referer': None  # if fetching still fails, set this to the target site's host
    }
    req_timeout = 500
    req = urllib2.Request(url, None, req_header)
    # Only needed behind a corporate proxy:
    # req.set_proxy('proxy.XXX.com:911', 'http')
    try:
        socket = urllib2.urlopen(req, None, req_timeout)
        content = socket.read()
        # content = socket.read().decode('GB18030')
        socket.close()
    except urllib2.HTTPError, e:
        print '\033[1;31;40m'
        print 'The server couldn\'t fulfill the request.'
        print 'Error code: ', e.code
        print 'Error reason: ', e.reason
        print '\033[0m'
    except urllib2.URLError, e:
        print '\033[1;31;40m'
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
        print '\033[0m'
    else:
        # everything is fine
        file_to_sql(code, content)
'''
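
# Put together, stock_header() and deal_url() build one Sina download URL per
# trading day; e.g. (this is the sample URL quoted in the disabled fetcher):
#
#   stock_header("300127")                      -> "&symbol=sz300127"
#   deal_url("2016-10-28", "&symbol=sz300127")
#     -> "http://market.finance.sina.com.cn/downxls.php?date=2016-10-28&symbol=sz300127"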
def browser(url):
    """Fetch one tick file with mechanize (used instead of the disabled
    urllib2 fetcher above); returns the response body, or "" on an empty
    URL."""
    print url
    if ("" == url):
        print "---newst---date---------------------------------"
        return ""
    br = mechanize.Browser()
    # Handler options
    br.set_handle_equiv(True)
    #br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but doesn't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.set_debug_http(True)
    br.set_debug_redirects(True)
    br.set_debug_responses(True)
    # Pretend to be a regular browser so the download isn't blocked.
    br.addheaders = [('User-agent',
                      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                      'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    r = br.open(url)
    content = r.read().strip()
    return content


def get_day_list(code, date_start):
    """Walk every weekday from the last stored date (or date_start) up to
    today, downloading and storing that day's ticks."""
    str_day, time_day = get_newest_date(code)
    print str_day
    url = stock_header(code)
    now = datetime.now()
    str_now = now.strftime("%Y-%m-%d")
    print "start date ", date_start, len(date_start)
    if ("" == str_day):
        day = datetime.strptime(date_start, "%Y-%m-%d")
    else:
        day = datetime.strptime(str_day, "%Y-%m-%d")
        # A stored tick at or after the 15:00 close means that whole day is
        # already complete, so resume from the next day.
        if ("15:00:00" <= time_day):
            day = day_plus1(day)
    print str_day, time_day, date_start
    while (day.strftime("%Y-%m-%d") <= str_now):
        # Skip Saturdays and Sundays.
        week = day.weekday()
        if ((5 == week) or (6 == week)):
            day = day_plus1(day)
            continue
        str_day = day.strftime("%Y-%m-%d")
        url_all = deal_url(str_day, url)
        content = browser(url_all)
        if ("" != content):
            file_to_sql(code, str_day, content)
        day = day_plus1(day)


if __name__ == '__main__':
    init_mongodb_list()
    print stocks.count()
    get_stock_list()
    conn_list.close()
    init_mongodb_day_detail()
    print len(list_code)
    for code in list_code:
        get_day_list(code["code"], code["date_start"])
    conn_detail.close()
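
# A minimal spot-check sketch for the imported data (hypothetical code
# "600036"; assumes the same "d"/"zz" credentials used above). Run it
# separately, e.g. from an interactive python2 session:
#
#   import pymongo
#   conn = pymongo.MongoClient("localhost", 27017)
#   conn.day_detail.authenticate("d", "zz")
#   for doc in conn.day_detail.detail.find({"code": "600036"}).sort(
#           [('date', -1), ('time', -1)]).limit(5):
#       print doc
#   conn.close()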