爬取奇迹秀工具箱里面的文本和软件网盘链接

1.爬取的网址:http://www.qijishow.com/down/index.html
爬取奇迹秀工具箱里面的文本和软件网盘链接_第1张图片
2.完整代码展示

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import os
import time

url = 'http://www.qijishow.com/down/index.html'

opt = webdriver.ChromeOptions()
opt.add_argument("--headless")
# `chrome_options=` is deprecated (removed in Selenium 4); `options=` works from 3.8 on.
driver = webdriver.Chrome(options=opt)
driver.get(url)

# Each software entry on the index page is rendered as a ".sm-6" card;
# we only need the count to drive the 1-based XPath indices below.
k = driver.find_elements_by_class_name("sm-6")

# Output files go into this directory; create it up front so the
# open() call below cannot fail with FileNotFoundError.
os.makedirs("爬虫", exist_ok=True)

# Remember the index tab so we can return to it after each detail page.
main_window = driver.current_window_handle

try:
    for i in range(1, len(k) + 1):
        print(i)
        print("---------------开始----------------------")
        # The thumbnail is lazy-loaded: the real URL lives in "data-src", not "src".
        a = driver.find_element_by_xpath(
            f'//*[@id="page"]/div[4]/div[1]/div[3]/div/div[{i}]/div/a/div[1]/img').get_attribute("data-src")
        a1 = 'http://www.qijishow.com/down/' + str(a)
        b = driver.find_element_by_xpath(f'//*[@id="page"]/div[4]/div[1]/div[3]/div/div[{i}]/div/a/div[2]').text
        print("软件图片地址链接:", a1)
        print("软件名字:", b)
        time.sleep(2)

        # Scroll the card into view before clicking so the click is not
        # intercepted by an overlapping element; the click opens the
        # detail page in a new browser tab.
        ele = driver.find_element_by_xpath(f'//div[@class="row"]/div[{i}]')
        driver.execute_script("arguments[0].scrollIntoView()", ele)
        ele.click()
        time.sleep(2)

        # Switch to the newly opened detail tab (always the last handle).
        windows = driver.window_handles
        driver.switch_to.window(windows[-1])

        p = driver.find_element_by_xpath('//*[@id="resources"]').text
        print(p)

        # Collect every image URL on the detail page.
        src = []
        for j in driver.find_elements_by_xpath('//img'):
            o = j.get_property('src')
            src.append(o)
            print('文章图片地址:', o)

        r = driver.find_element_by_xpath('//*[@id="download"]/a[@id="local"]').get_attribute('href')
        print("官网地址:", r)

        # Mac download block — not every tool has one, so only catch the
        # expected "element missing" case instead of a bare except that
        # would also hide real errors.
        try:
            x = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[2]/p').text
            y = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[2]/a[1]').get_attribute('href')
            print(x, y)
        except NoSuchElementException:
            x = "没有"
            y = "没有"

        # Windows download block, same best-effort handling.
        try:
            z = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[3]/p').text
            h = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[3]/a[1]').get_attribute('href')
            print(z, h)
        except NoSuchElementException:
            z = "没有"
            h = '没有'

        # One text file per tool, UTF-8 encoded.
        with open("爬虫/{}.txt".format(i), "w", encoding="utf-8") as f:
            f.write("软件图片地址链接:" + a1 + '\n')
            f.write("软件名字:" + b + '\n')
            f.write(p + '\n')
            f.write('文章图片地址:' + str(src) + '\n')
            f.write("官网地址:" + r + '\n')
            f.write(x + "Mac版百度网盘地址:" + y + '\n')
            f.write(z + "Win版百度网盘地址:" + h)

        # Close the detail tab and return to the index tab for the next item.
        driver.close()
        driver.switch_to.window(main_window)
        print("---------------结束----------------------")
finally:
    # Always shut the browser down, even if a locator fails mid-run;
    # the original script leaked a headless Chrome process on any error.
    driver.quit()

3.爬取的文档展示

爬取奇迹秀工具箱里面的文本和软件网盘链接_第2张图片
4.爬取并保存为表格完整代码

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import csv

url = 'http://www.qijishow.com/down/index.html'

opt = webdriver.ChromeOptions()
# opt.add_argument("--headless")  # enable for headless runs
driver = webdriver.Chrome()
driver.get(url)

# Each software entry on the index page is a ".sm-6" card; the count
# drives the 1-based XPath indices below.
k = driver.find_elements_by_class_name("sm-6")

# Remember the index tab so we can recover to it after each item.
main_window = driver.current_window_handle

try:
    # utf-8-sig writes a BOM so Excel opens the CSV with the right encoding.
    with open("交互设计.csv", "w", newline="", encoding="utf-8-sig") as datacsv:
        csvwriter = csv.writer(datacsv, dialect="excel")
        csvwriter.writerow(["软件图片地址链接", "软件名字", "富文本", "官网地址", "Mac版下载", "Win版下载"])
        for i in range(1, len(k) + 1):
            l = []
            print(i)
            try:
                print("---------------开始----------------------")
                # The thumbnail is lazy-loaded: the URL is in "data-src", not "src".
                a = driver.find_element_by_xpath(
                    f'//*[@id="page"]/div[4]/div[5]/div[3]/div/div[{i}]/div/a/div[1]/img').get_attribute("data-src")
                a1 = 'http://www.qijishow.com/down/' + str(a)
                b = driver.find_element_by_xpath(f'//*[@id="page"]/div[4]/div[5]/div[3]/div/div[{i}]/div/a/div[2]').text
                print("软件图片地址链接:", a1)
                print("软件名字:", b)
                l.append(a1)
                l.append(b)

                # Scroll the thumbnail into view before clicking so the click
                # is not intercepted; clicking opens the detail page in a new tab.
                ele = driver.find_element_by_xpath(f'//*[@id="page"]/div[4]/div[5]/div[3]/div/div[{i}]/div/a/div[1]/img')
                driver.execute_script("arguments[0].scrollIntoView()", ele)
                time.sleep(2)
                ele.click()
                windows = driver.window_handles
                driver.switch_to.window(windows[-1])

                # Keep the rich-text description as raw HTML for the CSV cell.
                p = driver.find_element_by_xpath('//*[@id="resources"]').get_attribute('outerHTML')
                print(p)
                l.append(p)

                r = driver.find_element_by_xpath('//*[@id="download"]/a[@id="local"]').get_attribute('href')
                print("官网地址:", r)
                l.append(r)

                # Mac download link is optional; only the missing-element
                # case is expected, so catch exactly that.
                try:
                    x = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[2]/p').text
                    y = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[2]/a[1]').get_attribute('href')
                    print(x, y)
                except NoSuchElementException:
                    x = "没有"
                    y = "没有"
                l.append(x + " " + y)

                # Windows download link, same best-effort handling.
                try:
                    z = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[3]/p').text
                    h = driver.find_element_by_xpath('//*[@id="zoom_download"]/div[3]/a[1]').get_attribute('href')
                    print(z, h)
                except NoSuchElementException:
                    z = "没有"
                    h = '没有'
                l.append(z + " " + h)

                csvwriter.writerow(l)
                print("---------------结束----------------------")
            except Exception as err:
                # Best-effort scrape: report which item failed and why instead
                # of the original bare `except: pass`, which hid every error.
                print("第{}项抓取失败:".format(i), err)
            finally:
                # Recover the browser state no matter where the item failed:
                # close any detail tab still open and return to the index tab.
                # (The original left the driver stranded on the detail tab
                # after an error, breaking all subsequent iterations.)
                for handle in driver.window_handles:
                    if handle != main_window:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(main_window)
finally:
    # Always shut the browser down, even on an unexpected failure.
    driver.quit()

5.爬取的表格展示
爬取奇迹秀工具箱里面的文本和软件网盘链接_第3张图片

你可能感兴趣的:(笔记,爬虫,python,windows)