该程序主要实现自动从 http://sc.chinaz.com/ 爬取 PPT 素材，并将其保存到 MySQL，其中使用 MySQL 进行线程间协作。
1, 使用urllib获取指定网页内容:
def get_html(url):
    """Return the raw bytes of the document served at *url*.

    The request is routed through a (placeholder) HTTP proxy before
    being opened with urllib.
    """
    req = urllib.request.Request(url)
    # Placeholder proxy address from the tutorial text.
    req.set_proxy('135.*.*.*:88**', 'http')
    resp = urllib.request.urlopen(req)
    body = resp.read()
    return body
def download_ppt(ppt_url, name):
    """Download the PPT archive at *ppt_url* into a local file *name*.

    The download is skipped when *name* already exists on disk.
    """
    if os.path.exists(name):
        print("file exist")
    else:
        print("download ", ppt_url)
        ppt_data = get_html(ppt_url)
        # Use a context manager so the file is really closed: the
        # original ended with 'output.close' (no parentheses), which
        # never invoked close() and leaked the handle.
        with open(name, 'wb+') as output:
            output.write(ppt_data)
3, 使用多线程运行爬虫，并使用 MySQL 保存爬取内容:
class mythread(threading.Thread):
    """Worker thread that scrapes chinaz.com PPT listing pages.

    Each worker opens its own MySQL connection so no cursor is shared
    across threads.
    """

    def __init__(self, pages):
        """Create a worker for the page numbers in *pages*.

        On connection failure the error is printed and ``self.conn`` /
        ``self.cur`` are left as ``None`` so later code can detect the
        missing connection instead of hitting AttributeError (the
        original left the attributes unset when connect() raised).
        """
        self.pages = pages
        threading.Thread.__init__(self)
        # Default the DB handles so they always exist.
        self.conn = None
        self.cur = None
        try:
            self.conn = pymysql.connect(host='localhost', user='root',
                                        passwd='', db='python_learn',
                                        port=3306, charset='utf8')
            self.cur = self.conn.cursor()
        except Exception as e:
            # Best-effort: report and continue without a DB connection.
            print(e)
def download_page(self,pages):
download_num = 0
for index in pages:
url = ''
if index == 1:
url = 'http://sc.chinaz.com/ppt/index.html'
else:
url = 'http://sc.chinaz.com/ppt/index_%d.html' % index
html = get_html(url)
mod_re = r'
import os
import urllib.request
import urllib.parse
import re
import threading
import pymysql
def get_html(url, proxy='135.2.77.29:8895'):
    """Fetch *url* and return the raw response body as bytes.

    Parameters
    ----------
    url : str
        Address of the page or file to retrieve.
    proxy : str or None
        ``host:port`` of an HTTP proxy to route the request through.
        The default preserves the original hard-coded proxy; pass
        ``None`` to connect directly.
        NOTE(review): the default proxy address is almost certainly
        stale — confirm it or call with proxy=None.
    """
    request = urllib.request.Request(url)
    if proxy is not None:
        request.set_proxy(proxy, 'http')
    # Context manager guarantees the response is closed (the original
    # leaked the open connection).
    with urllib.request.urlopen(request) as page:
        return page.read()
def download_ppt(ppt_url, name):
    """Download the PPT archive at *ppt_url* and save it as *name*.

    The download is skipped when a file named *name* already exists.
    """
    if os.path.exists(name):
        print("file exist")
    else:
        print("download ", ppt_url)
        ppt_data = get_html(ppt_url)
        # 'with' guarantees the file is closed even if write() raises:
        # the original called 'output.close' without parentheses, so
        # close() was never actually invoked and the handle leaked.
        with open(name, 'wb+') as output:
            output.write(ppt_data)
class mythread(threading.Thread):
    """Worker thread that scrapes chinaz.com PPT listing pages.

    Each worker opens a dedicated MySQL connection, so threads never
    share a connection or cursor.
    """

    def __init__(self, pages):
        """Create a worker responsible for the page numbers in *pages*.

        Opens a pymysql connection to the local ``python_learn``
        database.  On failure the error is printed and ``self.conn`` /
        ``self.cur`` stay ``None`` so later code can check for a live
        connection instead of crashing with AttributeError (the
        original left both attributes unset when connect() raised).
        """
        self.pages = pages
        threading.Thread.__init__(self)
        # Default the DB handles so they always exist, even when the
        # connect call below fails.
        self.conn = None
        self.cur = None
        try:
            self.conn = pymysql.connect(host='localhost', user='root',
                                        passwd='', db='python_learn',
                                        port=3306, charset='utf8')
            self.cur = self.conn.cursor()
        except Exception as e:
            # Best-effort: report and continue without a DB connection.
            print(e)
def download_page(self,pages):
download_num = 0
for index in pages:
url = ''
if index == 1:
url = 'http://sc.chinaz.com/ppt/index.html'
else:
url = 'http://sc.chinaz.com/ppt/index_%d.html' % index
html = get_html(url)
mod_re = r'