1.掌握python,爬虫的相关知识
2.开始实践,环境准备(安装 googletrans、requests、beautifulsoup4 库)
3.选取目标 - 谷歌,金山词霸
4.上代码
from googletrans import Translator
import requests, re
from bs4 import BeautifulSoup
import sys
# author : Comiii
# 2018/12/4
# purpose : university competition
class Tranlate():
    """Translate a piece of text with the third-party ``googletrans`` library.

    The translation runs eagerly in the constructor; the translated string
    is then available as ``Result``.

    ``flag`` selects the destination language code that is handed to
    ``Translator.translate``:

    * ``1`` -> ``"EN"``     (labelled "Chinese" in the author's notes)
    * ``2`` -> ``"zh-CN"``  (labelled "English")
    * ``3`` -> ``"ja"``     (labelled "Japanese")
    * ``4`` -> ``"zh-CN"``  (unlabelled; same destination as ``2``)
    """

    # Translated text; stays "" until a translation has completed.
    Result = ""

    # flag -> destination language code passed to googletrans.
    _DEST_BY_FLAG = {1: "EN", 2: "zh-CN", 3: "ja", 4: "zh-CN"}

    def __init__(self, text, flag):
        """Translate ``text`` into the language selected by ``flag``.

        Args:
            text: The text to translate.
            flag: One of 1, 2, 3 or 4 (see the class docstring).

        Raises:
            ValueError: If ``flag`` is not a supported value.
        """
        # Validate before touching the translator. Previously an unknown
        # flag fell through every branch and the constructor died with an
        # UnboundLocalError on ``result``.
        try:
            dest = self._DEST_BY_FLAG[flag]
        except (KeyError, TypeError):
            raise ValueError(
                "unsupported flag {!r}; expected one of {}".format(
                    flag, sorted(self._DEST_BY_FLAG)
                )
            ) from None

        translator = Translator()
        self.Result = translator.translate(text, dest=dest).text
class Spider():
    """Look up a keyword on iciba.com (Kingsoft PowerWord) and scrape the result.

    Constructing an instance performs the HTTP request immediately. The raw
    page is kept in ``Result`` and the extracted text in ``Soup``.
    """

    # Raw HTML returned for the lookup ('' before any request has been made).
    Result = ''
    # Text extracted from the page ('' when nothing could be extracted).
    Soup = ''

    # Inline style of the <div> that holds the sentence translator's output.
    # The attribute value is matched as written here, so this selector is
    # tied to the site's current markup.
    _TRANSLATOR_BOX_STYLE = (
        "width: 580px; margin-top: 15px; font-size: 18px; "
        "line-height: 24px; color: #333333;"
    )

    def __init__(self, KWord):
        """Fetch the iciba page for ``KWord`` and extract its translation.

        Args:
            KWord: The word or sentence to look up; it is appended to the
                site URL as-is.
        """
        # iciba uses one URL form for every kind of lookup. (The author also
        # noted Youdao as an alternative source:
        # http://www.youdao.com/w/<KWord>/#keyfrom=dict2.top)
        url = "http://www.iciba.com/" + KWord
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=60)
        self.Result = response.text
        self.CheckBeautifulsoup()

    @staticmethod
    def _last_text(tags):
        """Return the stripped first text node of the last tag that has one.

        Tags without any text node are skipped. Returns ``None`` when no tag
        yields text.
        """
        found = None
        for tag in tags:
            node = tag.find(text=True)
            if node is not None:
                found = node.strip()
        return found

    def CheckBeautifulsoup(self):
        """Parse ``Result`` with BeautifulSoup and store the text in ``Soup``.

        Two page layouts are tried in order:

        1. the sentence-translator result box (a ``<div>`` identified by its
           inline style);
        2. the word page, where the text sits in ``<span>`` elements inside
           ``<li class="clearfix">`` items.

        The second layout is only consulted when the first yields nothing.
        The original code behaved the same way, but only by accident: it
        rebound the parsed document to a string, so the second pass raised
        an AttributeError that a bare ``except`` swallowed. When neither
        layout matches, ``Soup`` is left unchanged.

        Author's note: this approach was not always effective for Youdao,
        but works for iciba.
        """
        document = BeautifulSoup(self.Result, "html.parser")

        # Layout 1: sentence translator.
        boxes = document.find_all(name='div', style=self._TRANSLATOR_BOX_STYLE)
        translated = self._last_text(boxes)
        if translated is not None:
            self.Soup = translated
            return

        # Layout 2: word page. Several spans may match; the last one wins,
        # which the author flagged as possibly not the intended entry.
        spans = []
        for item in document.find_all(name='li', attrs='clearfix'):
            spans.extend(item.find_all(name='span'))
        meaning = self._last_text(spans)
        if meaning is not None:
            self.Soup = meaning

    def CheckRe(self):
        """Placeholder for a regex-based extractor; not implemented yet.

        It only compiles an empty pattern and discards it.
        """
        pattern = re.compile(r'')
if __name__ == "__main__":
    # Demo run: translate one sample phrase with both back ends.
    text = "你好"
    # Each flag value selects a different translation category.
    flag = 1

    # Google Translate via the googletrans library.
    print(Tranlate(text, flag).Result)

    # iciba (Kingsoft PowerWord) scrape. Known issue noted by the author:
    # sometimes the site has a result but nothing is shown here.
    print(Spider(text).Soup)
5.总结:太简单了,一个轻量级网页爬取,没什么好总结的!