使用PYTHON3写了一个简单爬虫, 通过公司代理爬取ppt素材

该程序主要实现自动从 http://sc.chinaz.com/ 爬取PPT素材, 并将爬取结果保存到MySQL, 同时借助MySQL实现线程间的协作.


1, 使用urllib获取指定网页内容:

def get_html(url):
    """Fetch *url* through the configured HTTP proxy and return the raw body.

    Args:
        url: Absolute URL to request.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the proxy or the target cannot be reached.
    """
    request = urllib.request.Request(url)
    # NOTE: the proxy address below contains '*' placeholders; substitute a
    # real host:port before running.
    request.set_proxy('135.*.*.*:88**', 'http')
    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving it open until garbage collection.
    with urllib.request.urlopen(request) as page:
        return page.read()

2, 获取到PPT的下载路径后, 将文件下载并保存到本地:

def download_ppt(ppt_url, name):
	"""Download *ppt_url* into the local file *name* unless it already exists.

	Args:
		ppt_url: URL of the file to fetch (passed to ``get_html``).
		name: Destination path on the local filesystem.

	Returns:
		None. Progress is reported on stdout.
	"""
	if os.path.exists(name):
		# Already downloaded: skip the network round-trip entirely.
		print("file exist")
		return

	print("download ", ppt_url)
	ppt_data = get_html(ppt_url)
	# The original wrote ``output.close`` without parentheses, so the file
	# was never explicitly closed; ``with`` guarantees flush and close.
	with open(name, 'wb') as output:
		output.write(ppt_data)
3, 使用多线程运行爬虫, 并使用MySQL保存爬取内容:

class mythread(threading.Thread):
	def __init__(self, pages):
		"""Initialise the thread and try to open its own MySQL connection.

		Args:
			pages: Stored on ``self.pages``; presumably the page indexes
				this thread should crawl -- confirm against the caller.

		On connection failure the error is printed and ``self.conn`` /
		``self.cur`` are left as ``None`` so callers can detect the failure
		instead of hitting an AttributeError later.
		"""
		self.pages = pages
		threading.Thread.__init__(self)
		# Define both attributes up front: the original only assigned them
		# inside the try, so a failed connect left them missing entirely.
		self.conn = None
		self.cur = None
		try:
			self.conn = pymysql.connect(host='localhost',user='root',passwd='',db='python_learn',port=3306, charset='utf8')
			self.cur = self.conn.cursor()
		except Exception as e:
			# Best-effort: report the failure and keep the thread object usable.
			print(e)

	def download_page(self,pages):
		download_num = 0
		for index in pages:
			url = ''
			if index == 1:
				url = 'http://sc.chinaz.com/ppt/index.html'
			else:
				url = 'http://sc.chinaz.com/ppt/index_%d.html' % index
			
			html = get_html(url)
			mod_re = r'



完整代码如下:

import os
import urllib.request
import urllib.parse
import re
import threading
import pymysql

def get_html(url):
    """Fetch *url* through the configured HTTP proxy and return the raw body.

    Args:
        url: Absolute URL to request.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the proxy or the target cannot be reached.
    """
    request = urllib.request.Request(url)
    # All traffic is routed through this hard-coded HTTP proxy.
    request.set_proxy('135.2.77.29:8895', 'http')
    # The context manager closes the response (and its socket) even when
    # read() raises, instead of leaving it open until garbage collection.
    with urllib.request.urlopen(request) as page:
        return page.read()

def download_ppt(ppt_url, name):
	"""Download *ppt_url* into the local file *name* unless it already exists.

	Args:
		ppt_url: URL of the file to fetch (passed to ``get_html``).
		name: Destination path on the local filesystem.

	Returns:
		None. Progress is reported on stdout.
	"""
	if os.path.exists(name):
		# Already downloaded: skip the network round-trip entirely.
		print("file exist")
		return

	print("download ", ppt_url)
	ppt_data = get_html(ppt_url)
	# The original wrote ``output.close`` without parentheses, so the file
	# was never explicitly closed; ``with`` guarantees flush and close.
	with open(name, 'wb') as output:
		output.write(ppt_data)

class mythread(threading.Thread):
	def __init__(self, pages):
		"""Initialise the thread and try to open its own MySQL connection.

		Args:
			pages: Stored on ``self.pages``; presumably the page indexes
				this thread should crawl -- confirm against the caller.

		On connection failure the error is printed and ``self.conn`` /
		``self.cur`` are left as ``None`` so callers can detect the failure
		instead of hitting an AttributeError later.
		"""
		self.pages = pages
		threading.Thread.__init__(self)
		# Define both attributes up front: the original only assigned them
		# inside the try, so a failed connect left them missing entirely.
		self.conn = None
		self.cur = None
		try:
			self.conn = pymysql.connect(host='localhost',user='root',passwd='',db='python_learn',port=3306, charset='utf8')
			self.cur = self.conn.cursor()
		except Exception as e:
			# Best-effort: report the failure and keep the thread object usable.
			print(e)

	def download_page(self,pages):
		download_num = 0
		for index in pages:
			url = ''
			if index == 1:
				url = 'http://sc.chinaz.com/ppt/index.html'
			else:
				url = 'http://sc.chinaz.com/ppt/index_%d.html' % index
			
			html = get_html(url)
			mod_re = r'




你可能感兴趣的:(PYTHON)