Python Crawler Study Notes: Python Basics

1 IO Programming

1.1 Reading and Writing Files

Python has a built-in function for reading and writing files; its signature is:
open(name[, mode[, buffering]])

# Open a file
f = open(r'C:\text\myTextFile.txt')
f.close()
# Read the whole file
with open(r'C:\text\myTextFile.txt', 'r') as fileReader:
    print(fileReader.read())
# Read the file line by line (a fresh handle, since read() above already consumed the file)
with open(r'C:\text\myTextFile.txt', 'r') as fileReader:
    for line in fileReader.readlines():
        print(line.strip())
# Write to a file
with open(r'C:\text\myTextFile.txt', 'w') as fileWriter:
    fileWriter.write('myTextFile')

1.2 Working with Files and Directories

import os
import shutil
# Get the working directory of the current Python script
os.getcwd()
# List all files and directories under the given directory
os.listdir(dirpath)
# Delete a file
os.remove(filepath)
# Remove empty directories recursively
os.removedirs(r'D:\python')
# Check whether the given path is a file
os.path.isfile(filepath)
# Check whether the given path is a directory
os.path.isdir(filepath)
# Check whether a path is absolute
os.path.isabs(path)
# Check whether a path exists
os.path.exists(r'D:\python')
# Split a path into its directory name and file name
os.path.split(filepath)
# Split off the file extension
os.path.splitext(filepath)
# Get the directory part of a path
os.path.dirname(filepath)
# Get the file name part of a path
os.path.basename(filepath)
# Read and set environment variables
os.getenv('PATH')
os.putenv('KEY', 'value')
# Line terminator used by the current platform
os.linesep  # '\r\n' on Windows, '\n' on Linux
# Name of the platform in use
os.name
# Rename a file or directory
os.rename(old, new)
# Create nested directories
os.makedirs(r'C:\python\test')
# Create a single directory
os.mkdir('test')
# Get file attributes
os.stat(file)
# Change file permissions (timestamps are changed with os.utime)
os.chmod(file, mode)
# Get file size
os.path.getsize(filename)
# Copy a directory tree
shutil.copytree('olddir', 'newdir')
# Copy a file
shutil.copyfile('oldfile', 'newfile')
# Move a file
shutil.move('oldpos', 'newpos')
# Remove a directory
os.rmdir('dir')       # can only remove an empty directory
shutil.rmtree('dir')  # also removes non-empty directories
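
These calls are often combined. Below is a minimal sketch that lists the files in a directory together with their sizes; D:\python is just the example directory used in the listing above.

import os

dirpath = r'D:\python'  # example directory from the listing above
for name in os.listdir(dirpath):
    fullpath = os.path.join(dirpath, name)
    if os.path.isfile(fullpath):
        print('%s: %d bytes' % (name, os.path.getsize(fullpath)))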

1.3 Serialization (turning in-memory variables into a form that can be stored or transmitted)

import cPickle as pickle
d = dict(url='index.html', title='home page', content ='home page')
pickle.dumps(d)
f = open(r'D:\dump.txt', 'wb')
pickle.dump(d, f)
f.close()

# Deserialize
f = open(r'D:\dump.txt', 'rb')
d = pickle.load(f)
f.close()
d
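
pickle.dumps has an in-memory counterpart, pickle.loads, which rebuilds the object from the serialized string without going through a file. A short sketch follows (on Python 3 the built-in pickle module replaces cPickle; the names s and d2 are only illustrative):

import cPickle as pickle  # plain 'import pickle' on Python 3

d = dict(url='index.html', title='home page', content='home page')
s = pickle.dumps(d)   # serialize to a string
d2 = pickle.loads(s)  # rebuild an equal dict from that string
print(d2 == d)        # True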

2 Processes and Threads

A process is a program in execution on a computer: running a program starts a process. A process can be further divided into threads, which are the units that can be scheduled and run independently. Multiprocessing means that the same system allows several processes to be in the running state at once, which is also called multitasking; to achieve this on a single CPU, concurrency techniques are required.

2.1 Multiprocessing

Multiprocessing can be implemented with the os module's fork method (available only on Unix-like systems such as Linux and macOS, not on Windows). fork is called once but returns twice: the operating system copies the current parent process into a child process; in the parent, fork returns the child's process ID, while in the child it always returns 0.

import os
if __name__ == '__main__':
    print('current process %s start ... ' %(os.getpid()))
    pid = os.fork()
    if pid < 0:
        print('error in fork')
    elif pid == 0:
        print('I am child process %s and my parent process is %s' %(os.getpid(), os.getppid()))
    else:
        print('I %s created a child process %s.' %(os.getpid(), pid))

Implementing multiprocessing with the multiprocessing module:

import os
from multiprocessing import Process
def run_proc(name):
    print('child process %s (%s) running...' %(name, os.getpid()))
if __name__ == '__main__':
    print('parent process %s' %os.getpid())
    processes = []
    for i in range(5):
        p = Process(target=run_proc, args=(str(i), ))
        print('Process will start.')
        p.start()
        processes.append(p)
    # Wait for every child, not just the last one started
    for p in processes:
        p.join()
    print('Process end.')

Creating multiple processes with the multiprocessing module's Pool class:

from multiprocessing import Pool
import os, time, random
def run_task(name):
    print('Task %s (pid=%s) is running...' %(name, os.getpid()))
    time.sleep(random.random()*3)
    print('Task %s end.' %name)
if __name__ == '__main__':
    print('current process %s' %os.getpid())
    p = Pool(processes=3)
    for i in range(5):
        p.apply_async(run_task, args=(i, ))
    print('waiting for all subprocesses done...')
    p.close()
    p.join()
    print('all subprocesses done.')
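
apply_async returns an AsyncResult object, so if the task function returns a value it can be collected with get() once the pool has been joined. A short sketch under the same setup, with run_task modified to return a string (the wording of the string is only illustrative):

from multiprocessing import Pool
import os

def run_task(name):
    # Return a value instead of only printing it
    return 'Task %s finished in pid %s' % (name, os.getpid())

if __name__ == '__main__':
    p = Pool(processes=3)
    results = [p.apply_async(run_task, args=(i,)) for i in range(5)]
    p.close()
    p.join()
    for r in results:
        print(r.get())  # get() returns the task function's return value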

Inter-process communication: using a Queue to pass data between processes.

from multiprocessing import Process, Queue
import os, time, random

# Code executed by the writer processes
def proc_write(q, urls):
    print('Process(%s) is writing...' %os.getpid())
    for url in urls:
        q.put(url)
        print('Put %s to queue...' %url)
        time.sleep(random.random())
# Code executed by the reader process
def proc_read(q):
    print('Process(%s) is reading...' %os.getpid())
    while True:
        url = q.get(True)
        print('Get %s from queue.' %url)

if __name__ == '__main__':
    # The parent process creates the Queue and passes it to each child process
    q = Queue()
    proc_writer1 = Process(target=proc_write, args=(q, ['url_1', 'url_2', 'url_3']))
    proc_writer2 = Process(target=proc_write, args=(q, ['url_4', 'url_5', 'url_6']))
    proc_reader = Process(target=proc_read, args=(q, ))
    # Start the writer child processes (proc_write)
    proc_writer1.start()
    proc_writer2.start()
    # Start the reader child process (proc_read)
    proc_reader.start()
    # Wait for the writer processes to finish
    proc_writer1.join()
    proc_writer2.join()
    # proc_reader runs an infinite loop and cannot be waited on, so terminate it forcibly
    proc_reader.terminate()
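
The terminate() call above is a blunt instrument. An alternative sketch (with an assumed 3-second timeout) lets the reader give up on its own when the queue stays empty, since a multiprocessing Queue's get raises the Empty exception from the Queue module on timeout:

import os
import Queue  # the Empty exception lives here ('queue' on Python 3)

def proc_read_with_timeout(q):
    print('Process(%s) is reading...' % os.getpid())
    while True:
        try:
            url = q.get(True, timeout=3)  # assumed 3-second timeout
            print('Get %s from queue.' % url)
        except Queue.Empty:
            # No item arrived within the timeout: assume the writers are done
            break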

Using a Pipe for inter-process communication:

import multiprocessing
import random
import time, os

def proc_send(pipe, urls):
    for url in urls:
        print('Process(%s) send: %s' %(os.getpid(), url))
        pipe.send(url)
        time.sleep(random.random())
def proc_recv(pipe):
    while True:
        print('Process(%s) rev:%s' %(os.getpid(), pipe.recv()))
        time.sleep(random.random())
if __name__ == '__main__':
    pipe = multiprocessing.Pipe()
    p1 = multiprocessing.Process(target=proc_send, args=(pipe[0], ['url_'+str(i) for i in range(10)]))
    p2 = multiprocessing.Process(target=proc_recv, args=(pipe[1], ))
    p1.start()
    p2.start()
    p1.join()
    # proc_recv loops forever on recv() and cannot be joined; stop it once the sender is done
    p2.terminate()

2.2 Multithreading

The threading module is used to create threads. The first way is to pass a function into a Thread instance and then call its start method to begin execution.

import random
import time, threading
# Code executed by the new threads
def thread_run(urls):
    print('Current %s is running...' % threading.current_thread().name)
    for url in urls:
        print('%s ---->>> %s' % (threading.current_thread().name, url))
        time.sleep(random.random())
    print('%s ended.' % threading.current_thread().name)
print('%s is running...' % threading.current_thread().name)
t1 = threading.Thread(target=thread_run, name='Thread_1', args=(['url_1', 'url_2', 'url_3'],))
t2 = threading.Thread(target=thread_run, name='Thread_2', args=(['url_4', 'url_5', 'url_6'],))
t1.start()
t2.start()
t1.join()
t2.join()
print('%s ended.' % threading.current_thread().name)

The second way is to inherit directly from threading.Thread to create a thread class, then override the __init__ and run methods.

import random
import threading
import time
class myThread(threading.Thread):
    def __init__(self, name, urls):
        threading.Thread.__init__(self, name=name)
        self.urls = urls

    def run(self):
        print('Current %s is running...' % threading.current_thread().name)
        for url in self.urls:
            print('%s ---->>> %s' % ((threading.current_thread().name), url))
            time.sleep(random.random())
        print('%s ended.' % threading.current_thread().name)
print('%s is running...' % threading.current_thread().name)
t1 = myThread(name='Thread_1', urls=['url_1', 'url_2', 'url_3'])
t2 = myThread(name='Thread_2', urls=['url_4', 'url_5', 'url_6'])
t1.start()
t2.start()
t1.join()
t2.join()
print('%s ended.' % threading.current_thread().name)

Thread synchronization: when several threads modify shared data, protect the critical section with a lock (threading.RLock here).

import threading
mylock = threading.RLock()
num = 0
class myThread(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self, name=name)

    def run(self):
        global num
        while True:
            mylock.acquire()
            print('%s locked, Number: %d' % (threading.current_thread().name, num))
            if num>=4:
                mylock.release()
                print('%s released, Number: %d' % (threading.current_thread().name, num))
                break
            num += 1
            print('%s released, Number: %d' % (threading.current_thread().name, num))
            mylock.release()
if __name__ == '__main__':
    thread1 = myThread('Thread_1')
    thread2 = myThread('Thread_2')
    thread1.start()
    thread2.start()
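
The acquire/release pairing above is easy to get wrong (for example, forgetting to release on an early return). Lock objects also work as context managers, so the same critical section can be written with a with statement; a minimal sketch of that form (safe_increment is just an illustrative helper):

import threading

mylock = threading.RLock()
num = 0

def safe_increment():
    global num
    # 'with' acquires the lock on entry and releases it on exit,
    # even if the block raises an exception
    with mylock:
        num += 1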

2.3 Coroutines

A coroutine is a kind of lightweight thread. gevent is a coroutine-based Python networking library that provides fairly complete coroutine support.

from gevent import monkey;monkey.patch_all()
import gevent
import urllib2

def run_task(url):
    print('Visit --> %s' % url)
    try:
        response = urllib2.urlopen(url)
        data = response.read()
        print('%d bytes received from %s.' % (len(data), url))
    except Exception as e:
        print(e)

if __name__ == '__main__':
    urls = ['https://github.com/', 'https://www.python.org/', 'https://www.cnblogs.com/']
    greenlets = [gevent.spawn(run_task, url) for url in urls]
    gevent.joinall(greenlets)

Using the Pool object in gevent to manage concurrency over a dynamic number of greenlets:

from gevent import monkey
monkey.patch_all()
import urllib2
from gevent.pool import Pool


def run_task(url):
    print('Visit --> %s' % url)
    try:
        response = urllib2.urlopen(url)
        data = response.read()
        print('%d bytes received from %s.' % (len(data), url))
    except Exception as e:
        print(e)
    return 'url: %s ---> finish' % url
  
if __name__ == '__main__':
    pool = Pool(2)
    urls = ['https://github.com/', 'https://www.python.org/', 'http://www.cnblogs.com/']
    results = pool.map(run_task,urls)
    print(results)

2.4 Distributed Processes

A distributed process setup means spreading Process workers across multiple machines.
First, create the server (manager) process:

import Queue
from multiprocessing.managers import BaseManager
from multiprocessing import freeze_support

#Task number and define receive/send queue
task_number = 10
task_queue = Queue.Queue(task_number)
result_queue = Queue.Queue(task_number)

def get_task():
    return task_queue
  
def get_result():
    return result_queue

#Create QueueManager class
class QueueManager(BaseManager):
    pass
def win_run():
    QueueManager.register('get_task_queue', callable = get_task)
    QueueManager.register('get_result_queue', callable = get_result)
    
    #Bind port and setup validation token
    manager = QueueManager(address = ('127.0.0.1',8001), authkey = 'enterprise')
    
    #Initiate
    manager.start()
    try:
        #Access task queue and result queue through network
        task = manager.get_task_queue()
        result = manager.get_result_queue()
        
        #Add task
        for url in ['ImageUrl_'+str(i) for i in range(10)]:
            print('Put task %s ...' % url)
            task.put(url)
        print('try get result')
        for i in range(10):
            print('result is %s ' % result.get(timeout=10))
    except:
        print('Manager error')
    finally:
        #Must close, or there will be an error
        manager.shutdown()
if __name__ == '__main__':
    freeze_support()
    win_run()

Next, create the worker (task) process:

import time
from multiprocessing.managers import BaseManager


#Create QueueManager
class QueueManager(BaseManager):
    pass
  
#Use QueueManager to register
QueueManager.register('get_task_queue')
QueueManager.register('get_result_queue')

#Connect to the server
server_addr = '127.0.0.1'
print('Connect to server %s...' % server_addr)

#Port and validation token should be the same as the taskManager
m = QueueManager(address=(server_addr, 8001), authkey='enterprise')

m.connect()

#Access Queue object
task = m.get_task_queue()
result = m.get_result_queue()

#Access task from the queue and write into the result queue
while(not task.empty()):
    image_url = task.get(True, timeout=5)
    print('run task download %s...' % image_url)
    time.sleep(1)
    result.put('%s--->success' % image_url)

print('worker exit.')

3 Network Programming

A socket is an abstraction used in network programming. Python provides two basic modules for socket programming: socket and SocketServer.
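
The examples below use the lower-level socket module directly. For reference, SocketServer (renamed socketserver in Python 3) wraps the same functionality in a small server framework; here is a minimal echo-server sketch, reusing the 127.0.0.1:9999 address from the TCP example below (EchoHandler is just an illustrative name):

import SocketServer  # 'socketserver' on Python 3

class EchoHandler(SocketServer.BaseRequestHandler):
    def handle(self):
        # self.request is the TCP socket connected to the client
        data = self.request.recv(1024)
        self.request.sendall(b'Loop_Msg: ' + data)

if __name__ == '__main__':
    server = SocketServer.TCPServer(('127.0.0.1', 9999), EchoHandler)
    server.serve_forever()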

3.1 TCP Programming

TCP is a connection-oriented way of communicating. The following demonstrates creating a TCP server:

import socket
import threading
import time


def dealClient(sock, addr):
    #Receive data and send out
    print('Accept new connection from %s:%s...' % addr)
    sock.send(b'Hello, I am server')
    while True:
        data = sock.recv(1024)
        time.sleep(1)
        if not data or data.decode('utf-8') == 'exit':
            break
        print('--->>%s!' % data.decode('utf-8'))
        sock.send(('Loop_Msg: %s!' % data.decode('utf-8')).encode('utf-8'))

    #Close Socket
    sock.close()
    print('Connection from %s:%s closed.' % addr)

if __name__ == '__main__':
    #Create a Socket based on IPv4 and TCP protocol
    #Socket binds IP(127.0.0.1) and port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(('127.0.0.1', 9999))

    #Listen
    s.listen(5)
    print('Waiting for connection...')
    while True:
        #Receive a new connection
        sock, addr = s.accept()
        #Create a new thread to deal with TCP connection
        t = threading.Thread(target=dealClient, args=(sock, addr))
        t.start()

TCP client:

import socket
#Initialize Socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
#Connect to the target IP and port
s.connect(('127.0.0.1', 9999))
#Receive message
print('--->>'+s.recv(1024).decode('utf-8'))
#Send message
s.send(b'Hello, I am a client')
print('--->>'+s.recv(1024).decode('utf-8'))
s.send(b'exit')
#Close Socket
s.close()

3.2 UDP Programming

UDP is a connectionless protocol: as long as you know the peer's IP address and port number, you can send datagrams directly. First, create the server:

import socket


s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.bind(('127.0.0.1', 9999))
print('Bind UDP on 9999...')
while True:
    data, addr = s.recvfrom(1024)
    print('Received from %s:%s.' % addr)
    s.sendto(b'Hello, %s!' % data, addr)

Then create the client:

import socket


s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
for data in [b'Hello', b'World']:
    s.sendto(data, ('127.0.0.1', 9999))
    print(s.recv(1024).decode('utf-8'))
s.close()
