Notes on Web Scraping with Python, Part 1: Baidu Tieba

An important step in studying natural language processing in depth is collecting text and other material from the web, so I have started experimenting with web crawlers, beginning with the simplest targets that require no simulated login, such as Baidu Tieba and Douban.

The Firefox browser

Compared with Chrome, I have recently found Firefox particularly convenient: no extra packet-capture tool is needed, traffic can be monitored in real time, and the output is very readable.
F12 opens the Network monitor.
Inspector: view the entire HTML of the page, with search support.
Console: the parameters, responses, and cookies of GET/POST requests.
All the parameters and records are shown in great detail, and the request headers displayed there can be pasted straight into a script, as in the sketch below.
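
A minimal sketch of that workflow, assuming the header values were copied out of Firefox's Network monitor (the thread URL and User-Agent string here are placeholders, not values from the original post):

# -*- coding: utf-8 -*-
import urllib2

# Header values copied from the GET request shown in Firefox's Network monitor
# (placeholders; use whatever your own request shows).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
}
request = urllib2.Request('https://tieba.baidu.com/p/123456789', headers=headers)  # hypothetical URL
html = urllib2.urlopen(request).read().decode('utf-8')
print html[:200]  # first 200 characters of the page source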

The Baidu Tieba crawler

The full script (Python 2). The regexes that match the page HTML target the common Tieba markup of the time (core_title_txt, p_author_name, l_reply_num, post_content_) and may need adjusting if the page structure has changed:

# -*- coding: utf-8 -*-
import urllib2
import re


class Tool:
    """Strips Tieba post HTML down to readable plain text."""
    removeImg = re.compile(r'<img.*?>')                  # image tags
    removeAddr = re.compile(r'<a.*?>|</a>')              # link tags
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')  # line-breaking tags -> \n
    replaceBR = re.compile(r'<br><br>|<br>')             # <br> tags -> \n
    replaceTD = re.compile(r'<td>')                      # table cells -> \t
    replacePara = re.compile(r'<p.*?>')                  # paragraph openings -> \n + space
    removeExtraTag = re.compile(r'<.*?>')                # any remaining tags

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()


class BaiduTieba:
    def __init__(self, url, seelz, floortag=1):
        self.url = url
        self.seeLz = '?see_lz=' + str(seelz)  # see_lz=1: only the original poster's posts
        self.tool = Tool()
        self.file = None
        self.defaultTitle = u"百度贴吧"       # fallback file name
        self.floortag = floortag              # whether to add floor markers
        self.floor = 1

    def getPageContent(self, pagenum):
        url = self.url + self.seeLz + '&pn=' + str(pagenum)
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0'
        headers = {'User-Agent': user_agent}
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):  # not every URLError carries a reason attribute
                print e.reason

    def get_title(self, pagenum=1):
        content = self.getPageContent(pagenum)
        # The thread title sits in an <h3 class="core_title_txt ..."> tag.
        pattern_title = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        title = re.search(pattern_title, content)
        if title:
            return title.group(1).strip()
        else:
            return None

    def get_author(self, pagenum=1):
        content = self.getPageContent(pagenum)
        # The original poster's name sits in an <a class="p_author_name ..."> tag.
        pattern_author = re.compile(r'<a.*?class="p_author_name.*?>(.*?)</a>', re.S)
        author = re.search(pattern_author, content)
        if author:
            return author.group(1).strip()
        else:
            return None

    def get_reply_page(self, pagenum=1):
        content = self.getPageContent(pagenum)
        # The reply count and the page count are the two highlighted <span>
        # values inside <li class="l_reply_num ...">.
        pattern_page = re.compile(
            r'<li class="l_reply_num.*?<span.*?>(.*?)</span>.*?<span.*?>(.*?)</span>', re.S)
        totalpage = re.search(pattern_page, content)
        if totalpage:
            return totalpage.group(1).strip(), totalpage.group(2).strip()
        else:
            return None

    def getContent(self, pagenum):
        content = self.getPageContent(pagenum)
        # Each post body is a <div id="post_content_..."> block.
        pattern_content = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern_content, content)
        floor = 1
        contents = []
        for item in items:
            str_floor = str(floor) + u'楼——————————\n'
            tempContent = '\n' + self.tool.replace(item) + '\n'
            contents.append(str_floor.encode('utf-8'))
            contents.append(tempContent.encode('utf-8'))
            floor += 1
        return contents

    def writedata2File(self, contents):
        for item in contents:
            print u"正在写入" + str(self.floor) + u"楼的内容"  # "writing floor N"
            self.file.write(item)
            self.floor += 1

    def newFile(self, title):
        if title:
            self.file = open(title + '.txt', 'w+')
        else:
            self.file = open(self.defaultTitle + '.txt', 'w+')

    def start_spider(self, pagenum=1):
        # Fetch the first page for the basic thread information.
        title = self.get_title(pagenum)
        author = self.get_author(pagenum)
        self.newFile(title)
        totalpage = self.get_reply_page(pagenum)
        totalcontent = []
        for i in range(1, int(totalpage[1]) + 1):  # totalpage[1] is the page count (a string)
            totalcontent += self.getContent(i)
        try:
            self.writedata2File(totalcontent)
        except IOError, e:
            print u'写入文件发生异常: ' + e.message  # exception while writing the file
        finally:
            print u'写入文件完成'                    # finished writing
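
To drive the class, a minimal test run might look like this (the thread URL is a placeholder, not the thread from the original test):

if __name__ == '__main__':
    # Hypothetical thread URL; pass the base URL without the query string.
    url = 'https://tieba.baidu.com/p/123456789'
    spider = BaiduTieba(url, seelz=1)  # see_lz=1: keep only the original poster's posts
    spider.start_spider()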
Tips:
1. re.search(...).group(n): group(0) is the whole match, and group(1), group(2), group(3), ... are the numbered capture groups; see the short example below.
2. The user_agent and other header fields should be set according to the parameters of the GET request as shown in Firefox.
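
A quick illustration of the group numbering in tip 1 (a self-contained example, not taken from the crawler above):

# -*- coding: utf-8 -*-
import re

m = re.search(r'(\d+)-(\d+)', 'pages 12-34')
print m.group(0)  # '12-34' -- the whole match
print m.group(1)  # '12'    -- first capture group
print m.group(2)  # '34'    -- second capture group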
