import pymysql  # used by the real database query in connmysql (currently commented out)
from itertools import chain
import time
import queue
import threading
# from spiderPages import *

# Flag the worker threads poll to know when to exit
exitFlag = 0
# Database connection and data handling
class spider:
    # Initialization
    def __init__(self, is_threaded, threaded_number):
        # Whether to use multiple threads, and how many
        self.is_threaded = is_threaded
        self.threaded_number = threaded_number

    # Connect to the database and fetch the URL rows
    def connmysql(self):
        # conn = pymysql.connect(host="127.0.0.1", user="root", passwd="xxx", db="xxx", charset="utf8")
        # cursor = conn.cursor()
        # sql = "SELECT * FROM xx"
        # cursor.execute(sql)
        # results = cursor.fetchall()
        # Test data, in the same shape fetchall() returns: a tuple of one-column rows
        results = (('http://www.1.com',), ('http://www.2.com',), ('http://www.3.com',),
                   ('http://www.4.com',), ('http://www.5.com',), ('http://www.6.com',),
                   ('http://www.7.com',))
        return results

    # Split items into chunks of at most n elements,
    # e.g. list_split([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]
    def list_split(self, items, n):
        return [items[i:i + n] for i in range(0, len(items), n)]

    # Flatten the one-column rows into a plain list of URLs
    def get_url_data(self):
        res = self.connmysql()
        reslist = list(chain.from_iterable(res))
        return reslist
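# A minimal sketch of the live database path for connmysql, kept separate so the
# test data above still runs. The connection parameters and the table/column
# names ("urls", "url") are placeholders, not the project's real schema.
def connmysql_sketch():
    conn = pymysql.connect(host="127.0.0.1", user="root", passwd="xxx",
                           db="xxx", charset="utf8")
    try:
        with conn.cursor() as cursor:
            cursor.execute("SELECT url FROM urls")  # assumed table/column names
            return cursor.fetchall()  # same one-column-row shape as the test data
    finally:
        conn.close()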
# Worker thread class: pulls URLs off the shared queue and crawls them
class myThread(threading.Thread):
    def __init__(self, threadID, name, q):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q

    def run(self):
        print("Starting thread: " + self.name)
        process_data(self.name, self.q)
        print("Exiting thread: " + self.name)
# Invoke the crawler on a single URL
def spiderpage(url):
    pass
    # TODO
    # fout = open('test.txt', 'a')
    # fout.write(url + '\r\n')
    # fout.close()
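# A hedged sketch of one way spiderpage could be filled in, following the
# commented draft above: fetch the page and append the URL to test.txt.
# Illustration only; urllib and the 5-second timeout are assumptions,
# not part of the original design.
def spiderpage_sketch(url):
    from urllib.request import urlopen
    try:
        urlopen(url, timeout=5).read()  # fetch the page body
    except OSError:
        return  # network errors are simply skipped in this sketch
    with open('test.txt', 'a') as fout:
        fout.write(url + '\r\n')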
# Worker loop: pop a URL under the lock, then crawl it outside the lock
def process_data(threadName, q):
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            data = q.get()
            queueLock.release()
            spiderpage(data)
            print("%s processing %s" % (threadName, data))
        else:
            queueLock.release()
            time.sleep(1)
# Build the URL list (is_threaded / threaded_number are stored but currently unused);
# the variable is named crawler so it does not shadow the spider class
crawler = spider(True, 2)
nameList = crawler.get_url_data()
threadList = ["Thread-1", "Thread-2", "Thread-3"]
queueLock = threading.Lock()
workQueue = queue.Queue(10)
threads = []
threadID = 1

# Create the worker threads
for tName in threadList:
    thread = myThread(threadID, tName, workQueue)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue
queueLock.acquire()
for word in nameList:
    workQueue.put(word)
queueLock.release()

# Wait for the queue to drain
while not workQueue.empty():
    pass

# Tell the worker threads it is time to exit
exitFlag = 1

# Wait for all threads to finish
for t in threads:
    t.join()
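# An alternative shutdown pattern, sketched for comparison: queue.Queue is
# already thread-safe, so the explicit lock, the exitFlag polling, and the
# busy-wait above could be replaced with task_done()/join() plus sentinel
# values. Illustration only, not the original design.
#
# def worker(name, q):
#     while True:
#         url = q.get()
#         if url is None:       # sentinel: no more work
#             q.task_done()
#             break
#         spiderpage(url)
#         q.task_done()
#
# for url in nameList:
#     workQueue.put(url)
# workQueue.join()              # blocks until every item is task_done()
# for _ in threads:
#     workQueue.put(None)       # one sentinel per worker thread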