Selenium + BeautifulSoup 模拟翻页

#coding=utf-8

import unittest
from selenium import webdriver
from bs4 import BeautifulSoup

class douyuSelenium(unittest.TestCase):
    """Page through the Douyu directory listing and print every room found.

    The test drives a headless PhantomJS browser, parses each rendered page
    with BeautifulSoup, and clicks the pager's "next" control until the
    disabled-next marker shows up in the page source.
    """

    def setUp(self):
        """Start the PhantomJS browser used by the test."""
        self.driver = webdriver.PhantomJS()

    def testDouyu(self):
        """Print the viewer count and room title of every entry on every page."""
        self.driver.get('http://www.douyu.com/directory/all')
        while True:
            # Take ONE snapshot per iteration so that the parsed entries and
            # the last-page check below are based on the same DOM state.
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, "html.parser")

            titles = soup.select('h3.ellipsis')
            nums = soup.select('span.dy-num.fr')

            # Pair each viewer count with its room title; zip() stops at the
            # shorter list if the two selectors matched unequal numbers.
            for num, title in zip(nums, titles):
                # Single-argument print keeps the output identical on
                # Python 2 and Python 3 (count, space, tab, title).
                print(u"观众人数:" + num.get_text().strip()
                      + u" \t房间标题:" + title.get_text().strip())

            # The pager carries this class name once there is no next page.
            if 'shark-pager-disable-next' in page_source:
                break

            # NOTE(review): no explicit wait follows the click; the next
            # iteration presumably relies on the page updating quickly
            # enough -- confirm against the live site.
            self.driver.find_element_by_class_name('shark-pager-next').click()

    def tearDown(self):
        """Report completion and shut the browser down."""
        print('finish load ...')
        self.driver.quit()

if __name__ == '__main__':
    unittest.main()

Scrapy 模拟登录

# -*- coding: utf-8 -*-
import scrapy



class Renren2Spider(scrapy.Spider):
    """Log in to renren.com via its login form, then save a profile page.

    Flow: ``parse`` submits the login form, ``parse_page`` requests a page
    that is requested only after the login response arrives, and
    ``parse_newpage`` writes that page's raw body to ``xiao.html``.
    """

    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    def parse(self, response):
        """Handle the response for the login URL and submit the login form.

        Yields a ``FormRequest`` built from the form found in ``response``,
        with the e-mail and password fields filled in.
        """
        # Extra login parameters (if the site needs any) would be extracted
        # from ``response`` here and added to ``formdata``, e.g. an "_xsrf"
        # token read via response.xpath(...).

        # NOTE(review): credentials are hard-coded placeholders; real ones
        # should come from settings or the environment, not from source.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "[email protected]", "password": "axxxxxxxe"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """After the login response, request the profile page."""
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    def parse_newpage(self, response):
        """Save the raw body of the fetched page to ``xiao.html``."""
        # response.body is bytes, so the file must be opened in binary mode:
        # text mode raises TypeError on Python 3 and newline-translates the
        # payload on Windows under Python 2.
        with open("xiao.html", "wb") as html_file:
            html_file.write(response.body)

 

你可能感兴趣的:(小练手)