如何扩展关键词,以及使用python多线程爬取bing搜索结果

帮朋友在互联网上推广产品,但现有的关键词数量比较少,因此准备扩展一些关键词。我的思路是这样的:
1.准备一些基本关键词,使用bing搜索
2.将bing搜索结果标题保存下来

1.准备基本相关关键词

遍历两个关键词文件(base.txt 和 添加.txt),将其中的词两两拼接,合成新的关键词:

# Print every combination "<base keyword><add-on keyword>".
#
# Both files are opened with `with` so their handles are always closed, and
# the add-on list is read once up front rather than reopening 添加.txt for
# every base keyword.
with open('base.txt', 'r', encoding='utf-8') as canche_keys, \
        open('添加.txt', 'r', encoding='utf-8') as tianjia_keys:
    # Strip surrounding whitespace (including the trailing newline) from each
    # add-on keyword.
    suffixes = [t_key.strip() for t_key in tianjia_keys]
    for key in canche_keys:
        prefix = key.strip()
        for suffix in suffixes:
            new_key = prefix + suffix
            print(new_key)

2.分析bing搜索规律

# First results page for the sample query.
base_url = 'https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck'
# Second page: same query plus a `first` offset and FORM=PERE.
url1 = 'https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck&first=13&FORM=PERE'
# Larger `first` offset (presumably the third page).
url2 = 'https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck&first=27&FORM=PERE'

3.根据关键词生成bing base_url

import re


def get_bing_url(keywords):
    """Build the Bing search URL for a keyword phrase.

    Args:
        keywords: Search phrase, e.g. one line read from a keyword file.
            Leading and trailing newline characters are dropped.

    Returns:
        ``https://www.bing.com/search?q=`` followed by the phrase, with each
        whitespace character turned into ``+`` and every other character
        that is not URL-safe percent-encoded.
    """
    from urllib.parse import quote_plus

    keywords = keywords.strip('\n')
    # Turn every whitespace character (tab, '\r', ...) into a plain space so
    # that quote_plus renders each of them as '+'.
    phrase = re.sub(r'\s', ' ', keywords)
    # Percent-encode characters such as '&', '#' or '+', which would
    # otherwise be read as URL syntax and change the query.
    return 'https://www.bing.com/search?q=' + quote_plus(phrase)


if __name__ == '__main__':
    # Demo: build the search URL for a sample phrase and show it.
    sample_phrase = 'Where to buy the mobile food truck'
    bing_url = get_bing_url(sample_phrase)
    print(bing_url)

4.爬取bing结果

import re
import requests
from lxml.html import etree


def get_bing_url(keywords):
    """Build the cn.bing.com search URL for a keyword phrase.

    Args:
        keywords: Search phrase, e.g. one line read from a keyword file.
            Leading and trailing newline characters are dropped.

    Returns:
        ``https://cn.bing.com/search?q=`` followed by the phrase, with each
        whitespace character turned into ``+`` and every other character
        that is not URL-safe percent-encoded.
    """
    from urllib.parse import quote_plus

    keywords = keywords.strip('\n')
    # Turn every whitespace character (tab, '\r', ...) into a plain space so
    # that quote_plus renders each of them as '+'.
    phrase = re.sub(r'\s', ' ', keywords)
    # Percent-encode characters such as '&', '#' or '+', which would
    # otherwise be read as URL syntax and change the query.
    return 'https://cn.bing.com/search?q=' + quote_plus(phrase)


if __name__ == '__main__':
    # Fetch the first two result pages for a sample phrase and print the
    # title and snippet of every organic result.
    bing_url = get_bing_url('Where to buy the mobile food truck')

    # Optional local proxy; pass `proxies=proxies` to requests.get to use it.
    # proxies = {'http': 'http://127.0.0.1:10808', 'https': 'https://127.0.0.1:10808'}

    # Browser-like request headers.
    # NOTE(review): the cookie looks like a captured browser session and will
    # presumably go stale; consider loading it from configuration instead.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate',
               'cookie': 'DUP=Q=sBQdXP4Rfrv4P4CTmxe4lQ2&T=415111783&A=2&IG=31B594EB8C9D4B1DB9BDA58C6CFD6F39; MUID=196418ED32D66077102115A736D66479; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=DDFFA87D3A894019942913899F5EC316&dmnchg=1; ENSEARCH=BENVER=1; _HPVN=CS=eyJQbiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMC0wMy0xNlQwMDowMDowMFoiLCJJb3RkIjowLCJEZnQiOm51bGwsIk12cyI6MCwiRmx0IjowLCJJbXAiOjd9; ABDEF=V=13&ABDV=11&MRNB=1614238717214&MRB=0; _RwBf=mtu=0&g=0&cid=&o=2&p=&c=&t=0&s=0001-01-01T00:00:00.0000000+00:00&ts=2021-02-25T07:47:40.5285039+00:00&e=; MUIDB=196418ED32D66077102115A736D66479; SerpPWA=reg=1; SRCHUSR=DOB=20190509&T=1614253842000&TPC=1614238646000; _SS=SID=375CD2D8DA85697D0DA0DD31DBAB689D; _EDGE_S=SID=375CD2D8DA85697D0DA0DD31DBAB689D&mkt=zh-cn; _FP=hta=on; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; dsc=order=ShopOrderDefault; ipv6=hit=1614260171835&t=4; SRCHHPGUSR=CW=993&CH=919&DPR=1&UTC=480&WTS=63749850642&HV=1614256571&BRW=HTP&BRH=M&DM=0'

               }

    for i in range(1, 3):  # page through the results
        if i == 1:
            url = bing_url
        else:
            # NOTE(review): yields first=19 for page 2; verify this offset
            # against the pagination Bing actually uses.
            url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
        print(url)
        try:
            content = requests.get(url=url, timeout=5, headers=headers)
        except requests.RequestException as err:
            # Report the failed page and move on to the next one.
            print('request failed:', err)
            continue
        tree = etree.HTML(content.text)
        # print(content.text)
        li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
        for li in li_list:
            title_nodes = li.xpath('./h2/a')
            snippet_nodes = li.xpath('.//p')
            # Skip entries that lack a title link or a snippet paragraph.
            if not title_nodes or not snippet_nodes:
                continue
            h3 = title_nodes[0].xpath('string(.)')
            p = snippet_nodes[0].xpath('string(.)')
            print(h3)
            print(p)
            print('=======================')

5.存储到数据库中

import re
import os
import requests
from lxml.html import etree

import django

# Point Django at the project's settings module (unless the environment
# already names one) and initialise Django so that the model import that
# follows works from this standalone script.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "canche.settings")
django.setup()

from keywords_en.models import KeywordsSection, KeyWords, SectionContent


def get_bing_url(keywords):
    """Build the cn.bing.com search URL for a keyword phrase.

    Args:
        keywords: Search phrase, e.g. the text of a stored keyword.
            Leading and trailing newline characters are dropped.

    Returns:
        ``https://cn.bing.com/search?q=`` followed by the phrase, with each
        whitespace character turned into ``+`` and every other character
        that is not URL-safe percent-encoded.
    """
    from urllib.parse import quote_plus

    keywords = keywords.strip('\n')
    # Turn every whitespace character (tab, '\r', ...) into a plain space so
    # that quote_plus renders each of them as '+'.
    phrase = re.sub(r'\s', ' ', keywords)
    # Percent-encode characters such as '&', '#' or '+', which would
    # otherwise be read as URL syntax and change the query.
    return 'https://cn.bing.com/search?q=' + quote_plus(phrase)


if __name__ == '__main__':
    # For each stored keyword, fetch five Bing result pages and save every
    # result's title (KeywordsSection) and snippet (SectionContent).

    # Optional local proxy; pass `proxies=proxies` to requests.get to use it.
    # proxies = {'http': 'http://127.0.0.1:10808', 'https': 'https://127.0.0.1:10808'}

    # Browser-like request headers, built once because they are identical
    # for every request.
    # NOTE(review): the cookie looks like a captured browser session and will
    # presumably go stale; consider loading it from configuration instead.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
               'Accept-Encoding': 'gzip, deflate',
               'cookie': 'DUP=Q=sBQdXP4Rfrv4P4CTmxe4lQ2&T=415111783&A=2&IG=31B594EB8C9D4B1DB9BDA58C6CFD6F39; MUID=196418ED32D66077102115A736D66479; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=DDFFA87D3A894019942913899F5EC316&dmnchg=1; ENSEARCH=BENVER=1; _HPVN=CS=eyJQbiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMC0wMy0xNlQwMDowMDowMFoiLCJJb3RkIjowLCJEZnQiOm51bGwsIk12cyI6MCwiRmx0IjowLCJJbXAiOjd9; ABDEF=V=13&ABDV=11&MRNB=1614238717214&MRB=0; _RwBf=mtu=0&g=0&cid=&o=2&p=&c=&t=0&s=0001-01-01T00:00:00.0000000+00:00&ts=2021-02-25T07:47:40.5285039+00:00&e=; MUIDB=196418ED32D66077102115A736D66479; SerpPWA=reg=1; SRCHUSR=DOB=20190509&T=1614253842000&TPC=1614238646000; _SS=SID=375CD2D8DA85697D0DA0DD31DBAB689D; _EDGE_S=SID=375CD2D8DA85697D0DA0DD31DBAB689D&mkt=zh-cn; _FP=hta=on; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; dsc=order=ShopOrderDefault; ipv6=hit=1614260171835&t=4; SRCHHPGUSR=CW=993&CH=919&DPR=1&UTC=480&WTS=63749850642&HV=1614256571&BRW=HTP&BRH=M&DM=0'

               }

    # The slice skips the first 72 keywords.
    # NOTE(review): presumably this resumes an earlier partial run; confirm
    # before reusing the script.
    keys = KeyWords.objects.all()[72:]
    for k in keys:
        bing_url = get_bing_url(k.keywords)

        for i in range(1, 6):  # page through the results
            if i == 1:
                url = bing_url
            else:
                # NOTE(review): yields first=19 for page 2; verify this
                # offset against the pagination Bing actually uses.
                url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
            print(url)
            try:
                content = requests.get(url=url, timeout=5, headers=headers)
            except requests.RequestException as err:
                # One failed page must not abort the remaining keywords.
                print('request failed:', err)
                continue
            tree = etree.HTML(content.text)
            # print(content.text)
            li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
            for li in li_list:
                title_nodes = li.xpath('./h2/a')
                snippet_nodes = li.xpath('.//p')
                # Skip entries that lack a title link or a snippet paragraph.
                if not title_nodes or not snippet_nodes:
                    continue
                h3 = title_nodes[0].xpath('string(.)')
                p = snippet_nodes[0].xpath('string(.)')
                try:
                    # The row is saved before the many-to-many link to the
                    # keyword is added.
                    keywordssection = KeywordsSection(section=h3)
                    keywordssection.save()
                    keywordssection.keywords.add(k)
                    keywordssection.save()
                    print(keywordssection.section)
                    sectioncontent = SectionContent(content=p)
                    sectioncontent.save()
                    sectioncontent.keywordssection.add(keywordssection)
                    sectioncontent.save()
                    print(sectioncontent.content)
                    print('=============================')
                except Exception as err:
                    # Storage stays best-effort per result, but the failure
                    # is reported instead of being dropped silently.
                    print('failed to store result:', err)

你可能感兴趣的:(爬虫,python,xpath,搜索引擎,爬虫,python,seo)