I'm helping a friend promote a product online. The pool of keywords is fairly small, so I want to expand it. My plan is:
1. Prepare some base keywords and search them on Bing.
2. Save the titles from the Bing search results.
First, loop over two keyword lists and join every pair into a new keyword:
# Read the base keywords and the add-on keywords, then combine every pair
with open('base.txt', 'r', encoding='utf-8') as canche_keys:
    base_keys = [k.strip() for k in canche_keys if k.strip()]
with open('添加.txt', 'r', encoding='utf-8') as tianjia_keys:
    add_keys = [t.strip() for t in tianjia_keys if t.strip()]

for key in base_keys:
    for t_key in add_keys:
        new_key = key + t_key
        print(new_key)
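To feed these expanded keywords into the later steps, it helps to write them to a file rather than just print them. A minimal sketch under the same assumptions as above; new_keys.txt is a hypothetical output file, not something from the original script:

import itertools

# Sketch: persist the combined keywords so the scraping step can read them later.
# 'new_keys.txt' is a hypothetical file name.
with open('base.txt', encoding='utf-8') as f1, open('添加.txt', encoding='utf-8') as f2:
    base_keys = [k.strip() for k in f1 if k.strip()]
    add_keys = [t.strip() for t in f2 if t.strip()]

with open('new_keys.txt', 'w', encoding='utf-8') as out:
    for key, t_key in itertools.product(base_keys, add_keys):
        out.write(key + t_key + '\n')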
Looking at how Bing paginates the same query, only the first offset and the FORM parameter change from page to page:
base_url = 'https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck'
# Page 2
url1 = 'https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck&first=13&FORM=PERE'
# Page 3
url2 = 'https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck&first=27&FORM=PERE'
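Since only the first offset changes, the page URLs can be generated instead of hard-coded. A minimal sketch; page_urls is a hypothetical helper, and the step Bing uses between pages can vary by query, so the offsets are approximate:

def page_urls(base_url, pages=3, step=10):
    # Sketch: build URLs for pages 1..pages; 'first' is the index of the
    # first result shown on the page.
    urls = [base_url]
    for n in range(2, pages + 1):
        urls.append(base_url + '&first=' + str((n - 1) * step + 1) + '&FORM=PERE')
    return urls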
import re


def get_bing_url(keywords):
    # Turn a keyword phrase into a Bing search URL
    keywords = keywords.strip('\n')
    bing_url = re.sub(r'^', 'https://www.bing.com/search?q=', keywords)
    bing_url = re.sub(r'\s', '+', bing_url)
    return bing_url


if __name__ == '__main__':
    bing_url = get_bing_url('Where to buy the mobile food truck')
    print(bing_url)
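Running this prints https://www.bing.com/search?q=Where+to+buy+the+mobile+food+truck. The regex approach works for simple phrases; as an alternative, the standard library's quote_plus also escapes characters that the regex would pass through unchanged. A minimal sketch (get_bing_url_v2 is a hypothetical name, not from the original script):

from urllib.parse import quote_plus


def get_bing_url_v2(keywords):
    # Sketch: same idea as get_bing_url, but with proper URL encoding
    return 'https://www.bing.com/search?q=' + quote_plus(keywords.strip())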
Putting this together, the script below requests each results page and pulls the title and snippet out of every organic result:
import re
import requests
from lxml.html import etree


def get_bing_url(keywords):
    # Turn a keyword phrase into a Bing search URL
    keywords = keywords.strip('\n')
    bing_url = re.sub(r'^', 'https://cn.bing.com/search?q=', keywords)
    bing_url = re.sub(r'\s', '+', bing_url)
    return bing_url


if __name__ == '__main__':
    bing_url = get_bing_url('Where to buy the mobile food truck')
    # proxies = {'http': 'http://127.0.0.1:10808', 'https': 'https://127.0.0.1:10808'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate',
        'cookie': 'DUP=Q=sBQdXP4Rfrv4P4CTmxe4lQ2&T=415111783&A=2&IG=31B594EB8C9D4B1DB9BDA58C6CFD6F39; MUID=196418ED32D66077102115A736D66479; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=DDFFA87D3A894019942913899F5EC316&dmnchg=1; ENSEARCH=BENVER=1; _HPVN=CS=eyJQbiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMC0wMy0xNlQwMDowMDowMFoiLCJJb3RkIjowLCJEZnQiOm51bGwsIk12cyI6MCwiRmx0IjowLCJJbXAiOjd9; ABDEF=V=13&ABDV=11&MRNB=1614238717214&MRB=0; _RwBf=mtu=0&g=0&cid=&o=2&p=&c=&t=0&s=0001-01-01T00:00:00.0000000+00:00&ts=2021-02-25T07:47:40.5285039+00:00&e=; MUIDB=196418ED32D66077102115A736D66479; SerpPWA=reg=1; SRCHUSR=DOB=20190509&T=1614253842000&TPC=1614238646000; _SS=SID=375CD2D8DA85697D0DA0DD31DBAB689D; _EDGE_S=SID=375CD2D8DA85697D0DA0DD31DBAB689D&mkt=zh-cn; _FP=hta=on; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; dsc=order=ShopOrderDefault; ipv6=hit=1614260171835&t=4; SRCHHPGUSR=CW=993&CH=919&DPR=1&UTC=480&WTS=63749850642&HV=1614256571&BRW=HTP&BRH=M&DM=0'
    }
    for i in range(1, 3):  # page through the results
        if i == 1:
            url = bing_url
        else:
            url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
        print(url)
        content = requests.get(url=url, timeout=5, headers=headers)
        tree = etree.HTML(content.text)
        # print(content.text)
        li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
        for li in li_list:
            try:
                # The result title sits in h2/a, the snippet in the p element
                h3 = li.xpath('./h2/a')[0]
                h3 = h3.xpath('string(.)')
                p = li.xpath('.//p')[0]
                p = p.xpath('string(.)')
                print(h3)
                print(p)
                print('=======================')
            except Exception:
                pass
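Step 2 of the plan is to save those titles rather than just print them. A minimal sketch of that piece; save_titles and titles.txt are hypothetical names, and the idea is to collect the h3 strings from the loop above into a list and pass them in:

def save_titles(titles, path='titles.txt'):
    # Sketch: append scraped titles to a text file, one per line,
    # so they can be reviewed and filtered later.
    with open(path, 'a', encoding='utf-8') as f:
        for title in titles:
            f.write(title.strip() + '\n')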
The final version reads the keywords from the Django database instead of a text file, and writes the scraped titles and snippets back into it:
import re
import os
import requests
from lxml.html import etree
import django

# Hook into the existing Django project so its ORM models can be used directly
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "canche.settings")
django.setup()
from keywords_en.models import KeywordsSection, KeyWords, SectionContent


def get_bing_url(keywords):
    # Turn a keyword phrase into a Bing search URL
    keywords = keywords.strip('\n')
    bing_url = re.sub(r'^', 'https://cn.bing.com/search?q=', keywords)
    bing_url = re.sub(r'\s', '+', bing_url)
    return bing_url
if __name__ == '__main__':
    keys = KeyWords.objects.all()[72:]  # continue from the 73rd keyword onward
    for k in keys:
        bing_url = get_bing_url(k.keywords)
        # proxies = {'http': 'http://127.0.0.1:10808', 'https': 'https://127.0.0.1:10808'}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate',
            'cookie': 'DUP=Q=sBQdXP4Rfrv4P4CTmxe4lQ2&T=415111783&A=2&IG=31B594EB8C9D4B1DB9BDA58C6CFD6F39; MUID=196418ED32D66077102115A736D66479; SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=DDFFA87D3A894019942913899F5EC316&dmnchg=1; ENSEARCH=BENVER=1; _HPVN=CS=eyJQbiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiUCJ9LCJTYyI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiSCJ9LCJReiI6eyJDbiI6MiwiU3QiOjAsIlFzIjowLCJQcm9kIjoiVCJ9LCJBcCI6dHJ1ZSwiTXV0ZSI6dHJ1ZSwiTGFkIjoiMjAyMC0wMy0xNlQwMDowMDowMFoiLCJJb3RkIjowLCJEZnQiOm51bGwsIk12cyI6MCwiRmx0IjowLCJJbXAiOjd9; ABDEF=V=13&ABDV=11&MRNB=1614238717214&MRB=0; _RwBf=mtu=0&g=0&cid=&o=2&p=&c=&t=0&s=0001-01-01T00:00:00.0000000+00:00&ts=2021-02-25T07:47:40.5285039+00:00&e=; MUIDB=196418ED32D66077102115A736D66479; SerpPWA=reg=1; SRCHUSR=DOB=20190509&T=1614253842000&TPC=1614238646000; _SS=SID=375CD2D8DA85697D0DA0DD31DBAB689D; _EDGE_S=SID=375CD2D8DA85697D0DA0DD31DBAB689D&mkt=zh-cn; _FP=hta=on; SL_GWPT_Show_Hide_tmp=1; SL_wptGlobTipTmp=1; dsc=order=ShopOrderDefault; ipv6=hit=1614260171835&t=4; SRCHHPGUSR=CW=993&CH=919&DPR=1&UTC=480&WTS=63749850642&HV=1614256571&BRW=HTP&BRH=M&DM=0'
        }
        for i in range(1, 6):  # page through the results
            if i == 1:
                url = bing_url
            else:
                url = bing_url + '&qs=ds&first=' + str((i * 10) - 1) + '&FORM=PERE'
            print(url)
            content = requests.get(url=url, timeout=5, headers=headers)
            tree = etree.HTML(content.text)
            # print(content.text)
            li_list = tree.xpath('//ol[@id="b_results"]//li[@class="b_algo"]')
            for li in li_list:
                try:
                    h3 = li.xpath('./h2/a')[0]
                    h3 = h3.xpath('string(.)')
                    p = li.xpath('.//p')[0]
                    p = p.xpath('string(.)')
                    # Save the result title as a section linked to its source keyword
                    keywordssection = KeywordsSection(section=h3)
                    keywordssection.save()
                    keywordssection.keywords.add(k)
                    keywordssection.save()
                    print(keywordssection.section)
                    # Save the snippet and link it back to the section
                    sectioncontent = SectionContent(content=p)
                    sectioncontent.save()
                    sectioncontent.keywordssection.add(keywordssection)
                    sectioncontent.save()
                    print(sectioncontent.content)
                    print('=============================')
                except Exception:
                    pass
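The models in keywords_en/models.py aren't shown here. Judging from the .add() calls above, keywords and keywordssection behave like many-to-many fields; a minimal sketch of what the models might look like (field types and lengths are guesses, only the field names come from the code above):

from django.db import models


class KeyWords(models.Model):
    # The keyword phrase that gets searched on Bing
    keywords = models.CharField(max_length=255)


class KeywordsSection(models.Model):
    # A result title ("section") scraped from Bing, linked to its keywords
    section = models.CharField(max_length=500)
    keywords = models.ManyToManyField(KeyWords)


class SectionContent(models.Model):
    # A result snippet, linked back to the section it belongs to
    content = models.TextField()
    keywordssection = models.ManyToManyField(KeywordsSection)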