爬虫作业四

爬取免费的代理IP,并写入数据库。

①获取IP

from bs4 import  BeautifulSoup
from selenium import webdriver
from lxml import etree
import time
# NOTE(review): the `executable_path` kwarg and the `find_element_by_*`
# helpers are deprecated/removed in Selenium 4 — confirm the installed
# selenium version before reusing this script.
# Point executable_path at the local geckodriver binary for Firefox.
driver = webdriver.Firefox(executable_path="F:\python\\geckodriver")
driver.maximize_window()
t1 = time.time()  # wall-clock start; used for the timing printout later
driver.get("http://h.zhimaruanjian.com/?utm-source=bdtg&utm-keyword=?6122")
# C = driver.find_element_by_css_selector('[id="ip_list"]').text  # grab a specific element's text
# print(C)
search_results=[]  # raw cell texts scraped from the proxy table
# Table rows 2..7 hold six proxy entries on the first page; td[1]
# presumably contains the IP text — verify against the live page markup.
i=2
while i < 8:
    result=(driver.find_element_by_xpath('/html/body/div[8]/div/div/div[1]/div[1]/table/tbody/tr['+str(i)+']/td[1]').text)
    search_results.append(result)
    i= i +1
#print(search_results)


# for i in search_results:
#     #print(len(i))
#     print(i[4:])
    #print(C)
# Click the pagination link at a[6] to advance to the next page.
driver.find_element_by_xpath("/html/body/div[8]/div/div/div[1]/div[2]/div/a[6]").click()
T=2  # page counter: pages 2..50 (the first page was scraped above)
while T < 51:
    i = 2
    # Scrape table rows 2..7 (six proxies) of the current page.
    while i < 8:
        result = (driver.find_element_by_xpath('/html/body/div[8]/div/div/div[1]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]').text)
        search_results.append(result)
        i = i + 1
    #print(search_results)

        time.sleep(0.1)  # NOTE(review): indented into the inner loop — sleeps after every row, not once per page; confirm intent
    if T < 5:
        # The "next page" link sits at a[7] on early pages and shifts to
        # a[8] once more page numbers appear in the pager.
        driver.find_element_by_xpath("/html/body/div[8]/div/div/div[1]/div[2]/div/a[7]").click()
    else:
        driver.find_element_by_xpath("/html/body/div[8]/div/div/div[1]/div[2]/div/a[8]").click()
    T = T + 1
print('总耗时'+str(time.time() - t1)+'秒')  # total elapsed seconds
S= len(search_results)
print('共获取到 '+str(S)+'个ip')
IP_list=[]  # cleaned IP strings
for i in search_results:
    # Drop the first 4 characters of each cell text — presumably a fixed
    # prefix before the IP; verify against the scraped text format.
    a=i[4:]
    print(i[4:])
    IP_list.append(a)
print(IP_list)
print(len(IP_list))
#print(len(search_results))


# tree = etree.HTML(C)
# hot_li_list = tree.xpath('/html/body/div[8]/div/div/div[1]/div[1]/table/tbody/tr[3]/td[1]')


#td_list = soup.find_all('td',)

# print(td_list)
# for i in td_list:
#     print(i.string)
driver.quit()  # shut the browser down

②测试数据,连接数据库,写入数据库

import pymysql.cursors
# Sample proxy IPs (output of the scraper above) used to test the DB write path.
listall=['119.7.231.93', '111.126.77.211', '110.52.224.108', '182.114.26.144', '125.87.84.156', '122.232.230.146', '36.6.135.250', '27.157.247.251', '123.156.180.120', '113.141.223.165', '60.166.170.94', '42.248.68.8']
def getpsw(psw_file: str) -> str:
    """Read the database password from the first line of *psw_file*.

    Args:
        psw_file: Path to a plain-text file whose first line holds the password.

    Returns:
        The first line of the file with the trailing newline removed.
        (The original returned readline()'s raw value, which keeps '\\n'
        and would corrupt the password handed to pymysql; the redundant
        f.close() after the ``with`` block is also dropped.)
    """
    with open(psw_file, "r") as f:
        return f.readline().rstrip("\n")
psw_file='E:\\psw.txt'  # local file holding the MySQL root password
psw=getpsw(psw_file)
# Connect to the local MySQL server as root.
connection = pymysql.connect(host='127.0.0.1',
                           port=3306,
                           user='root',
                           password=psw,
                           db='mysql',
                           charset="utf8mb4")
try:
    cursor=connection.cursor()
    # Create the target database if it does not exist yet.
    cursor.execute("Create Database If Not Exists IP Character Set UTF8")
    print('创建IP数据库成功')

    # sql = "select * from person"
    # cursor.execute(sql)
    # result = cursor.fetchall()
    # for data in result:
    #     print(data)
except Exception:print("失败")
# NOTE(review): the except above swallows the error and execution
# continues; if cursor creation failed, the lines below raise NameError.
sql = "create table 司家璇 (ID char(255))"
try:
    cursor.execute("USE IP" )
    # Drop and recreate the table so each run starts from an empty table.
    cursor.execute('DROP TABLE IF EXISTS 司家璇')
    cursor.execute(sql)
    print('建表成功')
except Exception:print("建表失败")
# Parameterized insert — one row per IP string.
sql01 = "insert into 司家璇 (ID) values (%s)"
for i  in listall:
    cc=str(i)
    print(cc)
    cursor.execute(sql01,cc)
    cursor.connection.commit()  # commits after every single row

cursor.close()
connection.close()

③合并两段代码

from bs4 import  BeautifulSoup
from selenium import webdriver
from lxml import etree
import pymysql.cursors
import time
# NOTE(review): `executable_path` and the `find_element_by_*` helpers are
# deprecated/removed in Selenium 4 — confirm the installed selenium version.
print("请输一个大于0的非负整数,第57行中的地址改成自己存放数据库password的.txt的地址")
N = input("输入页数(每页六个,请输入整数):")
N = int(N)+1  # loop below runs while T < N, so +1 gives the requested page count
#print(N)
driver = webdriver.Firefox(executable_path="F:\python\\geckodriver")
driver.maximize_window()
t1 = time.time()  # wall-clock start for the timing printout below
driver.get("http://h.zhimaruanjian.com/?utm-source=bdtg&utm-keyword=?6122")
js="var q=document.documentElement.scrollTop=2333" # scroll the page down
driver.execute_script(js)
search_results=[]  # raw cell texts scraped from the proxy table
T=1  # current page number
while T < N:
    i = 2
    # Scrape table rows 2..7 (six proxies) of the current page; td[1]
    # presumably holds the IP text — verify against the live page markup.
    while i < 8:
        result = (driver.find_element_by_xpath('/html/body/div[8]/div/div/div[1]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]').text)
        search_results.append(result)
        i = i + 1
    #print(search_results)
    if T < 2:
        # From page 1 the "next page" link is at a[6]; it shifts to a[7]
        # and then a[8] as more page numbers appear in the pager.
        driver.find_element_by_xpath("/html/body/div[8]/div/div/div[1]/div[2]/div/a[6]").click()
        T= T+1
    else:
        if T < 5:
            driver.find_element_by_xpath("/html/body/div[8]/div/div/div[1]/div[2]/div/a[7]").click()
        else:
            driver.find_element_by_xpath("/html/body/div[8]/div/div/div[1]/div[2]/div/a[8]").click()
        T = T + 1

        # time.sleep(0.1)
print('总耗时'+str(time.time() - t1)+'秒')  # total elapsed seconds
S= len(search_results)
print('共获取到 '+str(S)+' 个ip')
IP_list=[]  # cleaned IP strings
for i in search_results:
    # Drop the first 4 characters of each cell text — presumably a fixed
    # prefix before the IP; verify against the scraped text format.
    a=i[4:]
    print(i[4:])
    IP_list.append(a)
print(IP_list)
print(len(IP_list))

driver.quit()# quit the browser

SJX=[]  # IPs confirmed written to the DB table (used for progress output)
listall=IP_list  # reuse the insert loop below with the freshly scraped IPs
def getpsw(psw_file: str) -> str:
    """Read the database password from the first line of *psw_file*.

    Args:
        psw_file: Path to a plain-text file whose first line holds the password.

    Returns:
        The first line of the file with the trailing newline removed.
        (The original returned readline()'s raw value, which keeps '\\n'
        and would corrupt the password handed to pymysql; the redundant
        f.close() after the ``with`` block is also dropped.)
    """
    with open(psw_file, "r") as f:
        return f.readline().rstrip("\n")
psw_file='E:\\psw.txt'  # local file holding the MySQL root password
psw=getpsw(psw_file)
# Connect to the local MySQL server as root.
connection = pymysql.connect(host='127.0.0.1',
                           port=3306,
                           user='root',
                           password=psw,
                           db='mysql',
                           charset="utf8mb4")
try:
    cursor=connection.cursor()
    # Create the target database if it does not exist yet.
    cursor.execute("Create Database If Not Exists IP Character Set UTF8")
    print('创建IP数据库成功')

    # sql = "select * from person"
    # cursor.execute(sql)
    # result = cursor.fetchall()
    # for data in result:
    #     print(data)
except Exception:print("失败")
# NOTE(review): the except above swallows the error and execution
# continues; if cursor creation failed, the lines below raise NameError.
sql = "create table 司家璇 (IP char(255))"
try:
    cursor.execute("USE IP" )
    # Drop and recreate the table so each run starts from an empty table.
    cursor.execute('DROP TABLE IF EXISTS 司家璇')
    cursor.execute(sql)
    print('建表成功')
except Exception:print("建表失败")
# Parameterized insert — one row per IP string.
sql01 = "insert into 司家璇 (IP) values (%s)"
for i  in listall:
    cc=str(i)
    #print(cc)
    SJX.append(cc)  # track how many rows have been written, for the message below
    print("第" + str(len(SJX)) + "个IP写入司家璇")
    cursor.execute(sql01,cc)
    cursor.connection.commit()  # commits after every single row


cursor.close()
connection.close()

你可能感兴趣的:(爬虫,学习,python,mysql,数据库,selenium)