Install VMware Workstation.
Add a virtual machine and install the CentOS operating system.
Configure the network: edit the NIC configuration file and set BOOTPROTO="dhcp" and ONBOOT="yes".
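A minimal sketch of the relevant lines (the interface name ens33 matches the file edited later in these notes; all other settings are left at their defaults):

```bash
# /etc/sysconfig/network-scripts/ifcfg-ens33 (excerpt)
BOOTPROTO="dhcp"    # obtain an IP address via DHCP
ONBOOT="yes"        # bring the interface up at boot
# Apply the change
systemctl restart network
```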
| Command | Description |
|---|---|
| cd / | Change to the root directory |
| cd .. | Go up one directory level |
| ls | List all files and folders in the current directory |
| ls -a | Also show hidden files |
| ls -l | Show detailed information (alias: ll) |
| mkdir | Create a directory |
| touch | Create a file |
| mv /lx /opt/ | Move (cut) a file or directory |
| mv lx xxx | Rename |
| cp lx.txt /opt/ | Copy |
| vi xxx | Open a file |
| i | Enter insert (editing) mode |
| esc | Leave insert mode |
| :w | Write (save) |
| :q | Quit |
| :! | Force the operation |
| rm -rf *** | Recursively delete files or directories (-r recursive, -f force) |
| pwd | Print the current working directory |
| ll | Show detailed information |
Review
Configure the virtual machine network
vim /etc/sysconfig/network-scripts/ifcfg-ens33  # set ONBOOT=yes
Change the hostname
Note: comments must not be used on the hostname entry.
vim /etc/sysconfig/network  # set HOSTNAME
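On CentOS 7 (which the systemctl commands below imply) the hostname can also be set and checked as follows; the name master is simply this document's example:

```bash
# Set the hostname without editing /etc/sysconfig/network
hostnamectl set-hostname master
# Verify the current hostname
hostname
```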
Disable the firewall
# Check the firewall status
firewall-cmd --state
# Temporarily stop the firewall
systemctl stop firewalld.service
# Prevent the firewall from starting on boot
systemctl disable firewalld.service
Edit the hosts file
Create hostname mappings for all hosts.
sudo vim /etc/hosts
## hosts file
192.168.183.128 Master
192.168.183.129 slave0
192.168.183.130 slave1
## Copy it to the other hosts
sudo scp /etc/hosts hadoop@slave0:/etc/hosts
sudo scp /etc/hosts hadoop@slave1:/etc/hosts
Test that the hosts can reach each other.
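For example, a quick reachability check from the master (hostnames taken from the hosts file above):

```bash
# Send a few ICMP echo requests to each slave
ping -c 3 slave0
ping -c 3 slave1
```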
ssh-keygen
cd ~/.ssh
scp id_rsa.pub hadoop@slave0:~/.ssh/master.pub
scp id_rsa.pub hadoop@slave1:~/.ssh/master.pub
## Log in to slave0 and slave1
cd ~/.ssh
touch authorized_keys
chmod 600 authorized_keys
cat master.pub >> authorized_keys

# Alternatively, use ssh-copy-id:
ssh-keygen
ssh-copy-id hadoop@slave0
ssh-copy-id hadoop@slave1

Verify that the configuration works.
ssh hadoop@slave0
ssh hadoop@slave1
ssh hadoop@master
Download the JDK to the /opt directory.
Change the ownership of the opt directory and its subdirectories.
sudo chown -R hadoop:hadoop opt/
# Extract the JDK
tar -zxvf jdk-8u101-linux-x64.tar.gz
# Configure the current user's environment variables
sudo vi ~/.bashrc
# Append the exports
export JAVA_HOME=/opt/jdk1.8.0_101
export PATH=$PATH:$JAVA_HOME/bin
# Exit the file and make the .bashrc changes take effect
source ~/.bashrc
# Test the configuration
java -version

Install the JDK on the other hosts.
## On each of the other hosts, change the directory ownership
sudo chown -R hadoop:hadoop /opt
## Back on the host where the JDK is already installed
scp -r jdk1.8.0_101/ hadoop@slave1:/opt/
scp -r jdk1.8.0_101/ hadoop@slave0:/opt/
sudo scp ~/.bashrc hadoop@slave1:~/.bashrc
sudo scp ~/.bashrc hadoop@slave0:~/.bashrc
## On each of the other hosts, make the configuration take effect
source ~/.bashrc
# Test the configuration
java -version
Download Hadoop to the /opt directory. The remaining steps are similar to the JDK installation.
# Extract Hadoop
tar -zxvf hadoop-2.6.4.tar.gz
# Configure the current user's environment variables
sudo vi ~/.bashrc
# Append the exports
export HADOOP_HOME=/opt/hadoop-2.6.4
export PATH=$PATH:$HADOOP_HOME/bin
# Exit the file and make the .bashrc changes take effect
source ~/.bashrc
# Test the configuration
hadoop version
Install Hadoop on the other hosts (same approach as for the JDK: scp the extracted directory and ~/.bashrc).
tar options
-z gzip compression
-x extract
-v verbose output
-f the archive file to operate on (required)
-C extract into the specified directory
-c create an archive
For example, the options combine as in the sketch below.
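A sketch reusing the archive from the earlier step:

```bash
# -z gunzip, -x extract, -v verbose, -f <archive>, -C <target directory>
tar -zxvf hadoop-2.6.4.tar.gz -C /opt
```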
Example
Count the occurrences of words beginning with dfs across all files in a folder.
cd /opt
mkdir input
# Copy the test files that ship with Hadoop
cp hadoop-2.6.4/etc/hadoop/*.xml input
# Run the word-count program
hadoop jar hadoop-2.6.4/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.4.jar grep input/ output 'dfs[a-z.]+'
# View the output
cat output/*

Pseudo-distributed mode configuration
cd /opt/hadoop-2.6.4/
vim etc/hadoop/core-site.xml
# Add inside the <configuration> tag:
<property>
  <name>fs.defaultFS</name>
  <value>hdfs://localhost:9000</value>
</property>
## Edit hdfs-site.xml
vim etc/hadoop/hdfs-site.xml
# Add inside the <configuration> tag:
<property>
  <name>dfs.replication</name>
  <value>1</value>
</property>
Format the file system.
bin/hdfs namenode -format
If the output reports exit status 0, the format succeeded.
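A minimal way to check, using the shell's exit status:

```bash
# Run right after bin/hdfs namenode -format
echo $?   # 0 means the format succeeded
```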
Start the NameNode and DataNode daemons.
sbin/start-dfs.sh
Open a browser; the default NameNode address is http://localhost:50070/
Common HDFS operations, using the word-count example.
cd /opt/hadoop-2.6.4
# Create a symlink for the hdfs command (the hdfs script uses relative paths, which may keep the link from working properly)
ln -s /opt/hadoop-2.6.4/bin/hdfs /usr/local/bin
# 1. Create a folder
hdfs dfs -mkdir /input
# 2. Upload files to the input folder
hdfs dfs -put etc/hadoop/* /input
# 3. Run the word-count example
bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.4.jar grep /input /output 'dfs[a-z.]+'
# 4. Download the output and view it
hdfs dfs -get /output output
cat output/*
# 5. View the output directly on HDFS
hdfs dfs -cat /output/*
# Stop the daemons
sbin/stop-dfs.sh
YARN on a Single Node
cd /opt/hadoop-2.6.4
cp etc/hadoop/mapred-site.xml.template etc/hadoop/mapred-site.xml
vim etc/hadoop/mapred-site.xml
## Add inside the <configuration> tag:
<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>
## Save and exit, then edit yarn-site.xml
vim etc/hadoop/yarn-site.xml
## Add inside the <configuration> tag:
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
## Save and exit, then start the ResourceManager and NodeManager daemons
sbin/start-yarn.sh
# Stop the daemons
sbin/stop-yarn.sh

Open a browser; the default ResourceManager address is http://localhost:8088/
On the master host:
cd /opt/hadoop-2.6.4/etc/hadoop
Edit core-site.xml and point the NameNode at the master machine.
vim core-site.xml
# Configure as follows:
<property>
  <name>fs.defaultFS</name>
  <value>hdfs://master:9000</value>
</property>
Edit hdfs-site.xml: set the NameNode storage path, the DataNode storage path, and the replication factor.
vi hdfs-site.xml
# Configure as follows:
<property>
  <name>dfs.namenode.name.dir</name>
  <value>/opt/dfs/name</value>
</property>
<property>
  <name>dfs.datanode.data.dir</name>
  <value>/opt/dfs/data</value>
</property>
<property>
  <name>dfs.replication</name>
  <value>2</value>
</property>
Edit mapred-site.xml.
vim mapred-site.xml
# Configure as follows:
<property>
  <name>mapred.job.tracker</name>
  <value>master:9001</value>
</property>
<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>
<property>
  <name>mapreduce.jobhistory.address</name>
  <value>master:10020</value>
</property>
Edit yarn-site.xml.
vim yarn-site.xml
# Configure as follows:
<property>
  <name>yarn.resourcemanager.hostname</name>
  <value>master</value>
</property>
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
<property>
  <name>yarn.log-aggregation-enable</name>
  <value>true</value>
</property>
Edit the slaves file and add the slave hosts.
Create a masters file and specify the master host (example contents are sketched below).
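For reference, the two files could be written like this (hostnames taken from the /etc/hosts mapping above; run from /opt/hadoop-2.6.4/etc/hadoop):

```bash
# slaves: one worker host per line
printf 'slave0\nslave1\n' > slaves
# masters: the master host
printf 'master\n' > masters
```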
Copy the configuration files to the other hosts.
scp -r ./* hadoop@slave0:/opt/hadoop-2.6.4/etc/hadoop/
scp -r ./* hadoop@slave1:/opt/hadoop-2.6.4/etc/hadoop/
Format the distributed file system.
hdfs namenode -format
Start the cluster.
sbin/start-dfs.sh
sbin/start-yarn.sh
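One way to confirm the daemons actually came up is jps on each node (a sketch; the exact process list depends on your configuration, and jps must be on the PATH of non-interactive shells for the ssh variant):

```bash
# On master: expect NameNode, SecondaryNameNode and ResourceManager
jps
# On each slave: expect DataNode and NodeManager
ssh hadoop@slave0 'jps'
ssh hadoop@slave1 'jps'
```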
See the official documentation.
wget 'https://dev.mysql.com/get/mysql57-community-release-el7-11.noarch.rpm'
# Tsinghua mirror: https://mirrors6.tuna.tsinghua.edu.cn/mysql/yum/mysql57-community-el6/
sudo rpm -Uvh mysql57-community-release-el7-11.noarch.rpm
yum repolist all | grep mysql
# Install the latest version
sudo yum install mysql-community-server
# Start MySQL
sudo service mysqld start
# sudo systemctl start mysqld  # CentOS 7
# Check the service status
sudo systemctl status mysqld
# Look up the generated temporary password
sudo grep 'temporary password' /var/log/mysqld.log
mysql -uroot -p  # enter the password found above
# Change the password
mysql> ALTER USER 'root'@'localhost' IDENTIFIED BY 'MyNewPass4!';

Relax the MySQL password policy.
mysql> show variables like 'validate_password%';
mysql> set global validate_password_policy=0;
mysql> set global validate_password_length=1;
# Then change the password again
mysql> set password for root@localhost = password('1234');
Copy mysql-connector-java.jar into the /usr/local/hive/lib directory (a sketch of the copy follows).
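A sketch of that copy; the exact jar version depends on the connector you downloaded, so the wildcard is an assumption:

```bash
# Copy the MySQL JDBC driver into Hive's lib directory
cp mysql-connector-java-*.jar /usr/local/hive/lib/
```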
Download address: https://mirrors.tuna.tsinghua.edu.cn/apache/hive/
tar -zxvf apache-hive-2.3.5-bin.tar.gz
sudo mv apache-hive-2.3.5-bin /usr/local/hive
# Configure .bashrc
export HIVE_HOME=/usr/local/hive
export PATH=$PATH:$HIVE_HOME/bin
export CLASSPATH=$CLASSPATH:/usr/local/Hadoop/lib/*:.
export CLASSPATH=$CLASSPATH:/usr/local/hive/lib/*:.
# Make the configuration take effect
source ~/.bashrc
Edit hive-site.xml under /usr/local/hive/conf and switch Hive from its built-in Derby metastore to MySQL.
cd /usr/local/hive/conf
mv hive-default.xml.template hive-default.xml
vim hive-site.xml
# Configure as follows:
<property>
  <name>javax.jdo.option.ConnectionURL</name>
  <value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
  <description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionDriverName</name>
  <value>com.mysql.jdbc.Driver</value>
  <description>Driver class name for a JDBC metastore</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionUserName</name>
  <value>hive</value>
  <description>username to use against metastore database</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionPassword</name>
  <value>hive</value>
  <description>password to use against metastore database</description>
</property>
Start MySQL and create a hive database.
mysql -u root -p
# Create a Hive database; it corresponds to the "hive" in //localhost:3306/hive in hive-site.xml and stores Hive's metadata
create database hive;
Configure MySQL to allow Hive to connect.
# Grant all privileges on all tables of all databases to the hive user; the password matches the connection password configured in hive-site.xml (setting it to 'hive' may require relaxing the password policy first)
grant all on *.* to hive@localhost identified by 'hive';
# Flush the MySQL privilege tables
flush privileges;
Start Hadoop in pseudo-distributed mode (adjust the slaves file accordingly): the Hadoop cluster must be running before Hive is started.
hive
Problem: Hive 2 requires the metastore schema to be initialized.
Add the following property to Hive's configuration file hive-site.xml:
<property>
  <name>datanucleus.schema.autoCreateAll</name>
  <value>true</value>
</property>
Drop the old hive database in MySQL:
drop database hive;
Initialize the metastore schema.
schematool -initSchema -dbType mysql
Restart Hive.
Common HiveQL operations
create database if not exists hive;
show databases;
show databases like 'h.*';
use hive;
drop database if exists hive;
drop database if exists hive cascade;
# Import the requests package
import requests

page_id = 910
paged = 1
payload = {'paged_id': page_id, 'paged': paged}
# r = requests.get('https://blog.poryoung.cn/?page_id='+str(page_id)+'&paged='+str(paged))
# Pass parameters
r = requests.get('https://blog.poryoung.cn', params=payload)
# print(r.text)

# Response status
# r.raise_for_status()
# r.status_code

# Change the encoding
r.encoding = 'utf-8'
print(r.encoding)

# Binary response content
# r.content
from PIL import Image
from io import BytesIO
i = Image.open(BytesIO(r.content))

# JSON response
# r.json()

# Raw response content
r = requests.get('https://blog.poryoung.cn', params=payload, stream=True)
r.raw

# Custom request headers
# headers = {'user-agent': 'my-app/0.0.1'}
# r = requests.get(url, headers=headers)

# POST
# r = requests.post(url, data=payload)
# r = requests.post(url, data=json.dumps(payload))

# Cookies
# r = requests.get(url)
# r.cookies['example_cookie_name']
# Send cookies
# cookies = dict(cookies_are='working')
# r = requests.get(url, cookies=cookies)
# The returned cookie object is a [RequestsCookieJar](https://2.python-requests.org//zh_CN/latest/api.html#requests.cookies.RequestsCookieJar)

# Timeout
# requests.get('http://github.com', timeout=0.001)

# Session
# [Session objects](https://2.python-requests.org//zh_CN/latest/user/advanced.html#session-objects)

# Proxies
proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
# requests.get("http://example.org", proxies=proxies)
import requests
from bs4 import BeautifulSoup


def spider_xiaohuar_content(url, headers):
    response = requests.get(url=url, headers=headers)
    print(response.status_code)
    if response.status_code == 200:
        response.encoding = 'utf-8'
        html = response.content
        # Arguments: page content, parser
        soup = BeautifulSoup(html, 'html5lib')
        div_list = soup.find_all('div', attrs={'class': 'all_lanmu'})
        text = ''
        file = open('爬虫校花.md', 'w', encoding='utf-8')
        for div in div_list:
            title_div = div.find('div', attrs={'class': 'title1000'})
            title = title_div.find('a').string
            text += '\n\n## 标题:'+title+'\n\n'
            ul = div.find('ul')
            li_list = ul.find_all('li')
            for li in li_list:
                img_src = li.find('img').attrs['lazysrc']
                a_href = li.find('a').attrs['href']
                img_title = li.find('span').string
                school = li.find('b', attrs={'class': 'b1'}).string
                fav = li.find('b', attrs={'class': 'b2'}).string
                if url not in img_src:
                    img_src = url+img_src
                text += '> ' + img_title+'\n\n'
                text += ''+'\n\n'
                text += '- 学校:'+school+'\n\n'
                text += '- 点赞人数:'+fav+'\n\n'
        file.write(text)
        file.close()


url = 'http://xiaohuar.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
spider_xiaohuar_content(url, headers)

Links to the code and the scraped results:
spider-xiaohuar.com.py
校花爬取.md
conda create -n py3 python=3    # Create a Python 3 environment named py3
source activate py3             # Activate the py3 environment
conda install ipykernel         # Install the ipykernel module
python -m ipykernel install --user --name py3 --display-name "py3"   # Register the kernel
jupyter notebook                # Start Jupyter Notebook; "py3" now appears under "New"

Edit the JSON settings file.
"python.pythonPath": "D:\\software\\anaconda\\envs\\bigData\\python.exe",
"python.autoComplete.extraPaths": [
    "D:\\software\\anaconda\\envs\\bigData",
    "D:\\software\\anaconda\\envs\\bigData\\Lib\\site-packages"
],
import requests from bs4 import BeautifulSoup import random import hashlib import time import json from string import Template runTimes = 10 waitTime = 5 jobDict = {} dataTemplate = '> 工作名称\n\n`${jobName}`\n\n> 公司\n\n[${companyName}](${companyUrl})\n\n\n\n`${companyType}`\n\n> 城市\n\n`${city}`\n\n> 学历要求\n\n`${eduLevel}`\n\n> 薪资\n\n`${salary}`\n\n> 福利\n\n${welfare}\n\n---\n\n' def spider_zhilian_content(url, headers, params): response = requests.get(url, headers=headers, params=params) print(response.status_code) if response.status_code == 200: response.encoding = 'utf-8' respJson = response.json() # data = json.loads(respJson['data']) data = respJson['data']['results'] content = '' for item in data: jobDict[item['jobName']] = item['number'] welfare = '' for w in item['welfare']: welfare += '`'+w+'` ' new_data = Template(dataTemplate).substitute(jobName=item['jobName'], jobNumber=item['number'], companyName=item['company']['name'], companyUrl=item['company']['url'], companyLogo=item[ 'companyLogo'], companyType=item['company']['type']['name'], city=item['city']['display'], eduLevel=item['eduLevel']['name'], salary=item['salary'], welfare=welfare) content += new_data return content # 随机生产请求参数 def params_generator(times): _v = str(random.random())[0:10] hl = hashlib.md5() hl.update(_v.encode(encoding='utf-8')) cid = str(hl.hexdigest()) cid = cid[0:8]+'-'+cid[8:12]+'-'+cid[12:16]+'-'+cid[16:20]+'-'+cid[20:32] hl.update(cid.encode(encoding='utf-8')) prid = str(hl.hexdigest())+_v[2:7] start = times*90 return { "start": start, "pageSize": 90, "cityId": 489, "salary": "0,0", "workExperience": -1, "education": -1, "companyType": -1, "employmentType": -1, "jobWelfareTag": -1, "kw": "大数据", "kt": 3, "_v": _v, "x-zp-page-request-id": prid, "x-zp-client-id": cid } headers = { "Accept": "application/json, text/plain, */*", "Referer": "https://sou.zhaopin.com/?jl=489&sf=0&st=0&kw=%E5%A4%A7%E6%95%B0%E6%8D%AE&kt=3", "Origin": "https://sou.zhaopin.com", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } url = 'https://fe-api.zhaopin.com/c/i/sou' times = 0 spider_time_start = time.time() # if __name__ == 'main': f = open('zhilian.md', 'a+', encoding='utf-8') totalContent = '## 数据列表\n\n' while(times < runTimes): params = params_generator(times) content = spider_zhilian_content(url, headers, params) totalContent += content time.sleep(waitTime) times += 1 spider_time_end = time.time() spider_count_time = spider_time_end-spider_time_start-runTimes*waitTime imgStyle = '\n\n' jobIndex = '> 爬取关键字:`大数据`\n> 爬取数据量:`' + \ str(len(jobDict))+'`\n去除等待时间('+str(runTimes*waitTime) + \ 's)爬取时间:`'+str(spider_count_time)+'`\n\n## 目录\n\n' for k, v in jobDict.items(): jobIndex += '- ['+k+'](#'+v+')\n' jobIndex += '\n---\n\n' f.write(imgStyle+jobIndex+totalContent) f.close()
Create the class.
class ZhiLian:
    def __init__(self, *args, **kwargs):
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Referer": "https://sou.zhaopin.com/?jl=489&sf=0&st=0&kw=%E5%A4%A7%E6%95%B0%E6%8D%AE&kt=3",
            "Origin": "https://sou.zhaopin.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.url = 'https://fe-api.zhaopin.com/c/i/sou'
        self.f = open('spider-zhilian.csv', 'w', encoding='utf-8')
        self.out = _csv.writer(self.f)
        self.out.writerow(['工作ID', '工作名', '公司名', '公司Url', '公司Logo', '公司类型', '城市',
                           '学历要求', '薪资', '福利', '招聘人数', '工作亮点', '职位描述', '技能要求'])
        _v = str(random.random())[0:10]
        hl = hashlib.md5()
        hl.update(_v.encode(encoding='utf-8'))
        cid = str(hl.hexdigest())
        cid = cid[0:8]+'-'+cid[8:12]+'-' + \
            cid[12:16]+'-'+cid[16:20]+'-'+cid[20:32]
        hl.update(cid.encode(encoding='utf-8'))
        prid = str(hl.hexdigest())+_v[2:7]
        self._v = _v
        self.cid = cid
        self.prid = prid
Add a method that generates the request parameters.
    # Generate request parameters
    def params_generator(self, times, keyword):
        start = times*90
        return {
            "start": start,
            "pageSize": 90,
            "cityId": 489,
            "salary": "0,0",
            "workExperience": -1,
            "education": -1,
            "companyType": -1,
            "employmentType": -1,
            "jobWelfareTag": -1,
            "kw": keyword,
            "kt": 3,
            "_v": self._v,
            "x-zp-page-request-id": self.prid,
            "x-zp-client-id": self.cid
        }
The first ten fields can be parsed directly from the JSON returned by the Zhilian API.
    # Send the request
    def getJsonData(self, params):
        response = requests.get(self.url, headers=self.headers, params=params)
        if response.status_code == 200:
            response = response.json()
            return response['data']['results']
        return False

    # Parse the returned data
    def parseData(self, jsonData):
        for item in jsonData:
            # try:
            list = []
            jobNumber = item['number']
            jobName = item['jobName']
            companyName = item['company']['name']
            companyUrl = item['company']['url']
            companyLogo = item['companyLogo']
            companyType = item['company']['type']['name']
            city = item['city']['display']
            eduLevel = item['eduLevel']['name']
            salary = item['salary']
            welfare = item['welfare']
            # Fetch the last four fields
            (recruit, highlight, describe, skill) = self.getDetailData(
                item['positionURL'])
            list.extend([jobNumber, jobName, companyName, companyUrl, companyLogo,
                         companyType, city, eduLevel, salary, welfare,
                         recruit, highlight, describe, skill])
            self.out.writerow(list)
            # except:
            #     print('function:parseData Error!')
The last four fields require scraping the job-detail page, which sometimes triggers a verification challenge. Scrape the page directly, check whether the scraped data is empty, and in that case save the parsed HTML so it can be served locally, which makes it easier to examine the verification mechanism later.
def getDetailData(self, url): response = requests.get(url) if response.status_code == 200: response.encoding = 'utf-8' d = pq(response.text) """ print(len(d('#nocaptcha .nc_iconfont.btn_slide'))) # 检查是否需要验证 if len(d('#nocaptcha .nc_iconfont.btn_slide')) > 0: zlms = ZhilianMS(url) flag, html = zlms.start_simulate() zlms.close() print('?') if flag == True: d = pq(html) else: # 验证失败 return '', '', '', '' """ # 招收人数 recruit = d( '.summary-plane .summary-plane__info li:last').text()[1:-1] # 检查爬取的内容是否为空(有时可能是页面无数据,而不是需要验证,此时使用上面的方法) if len(recruit) == 0: # 如果需要验证,输出该验证页面的源码为imgCaptcha.htm,方便使用浏览器检查元素 # ff = open('imgCaptcha.htm', 'r+', encoding='utf-8') # if len(ff.read()) == 0: # ff.write(response.text) # ff.close() ############ # 破解智联验证,不需要验证时注释这段代码 zlms = ZhilianMS(url) flag, html = zlms.start_simulate(time=0.1) zlms.close() if flag == True: d = pq(html) else: # 验证失败 return '', '', '', '' ############ highlightList = d('.a-center-layout__content .highlights__content span') highlight = [] for i in highlightList.items(): highlight.append(i.text()) # 职位描述 describe = d('.describtion .describtion__detail-content').text().strip('\n') # 技能要求 skill = d('.description .describtion__skills-content').text().split(' ') return recruit, highlight, describe, skill
Use the getDetailData method to save Zhilian's verification page locally as imgCaptcha.htm, then serve it with a local NodeJS HTTPS server.
For setting up a local HTTPS server with NodeJS, see the "NodeJS搭建本地HTTPS服务器" section of my earlier MQTT + NodeJS + Weixin Small-Program post.
Once the certificate is signed, start a simple HTTPS server.
const https = require('https')
const fs = require('fs')
const options = {
  key: fs.readFileSync('your_ssl_addres/server.key'),
  cert: fs.readFileSync('your_ssl_addres/server.crt')
};
https.createServer(options, function (req, res) {
  setTimeout(() => {
    let html = fs.readFileSync('./imgCaptcha.htm')
    res.end(html);
  }, 1000)
}).listen(9008);

The verification page can now be inspected locally, which is the basis for defeating the verification later.
Record a manual slide track
First record the mouse movement track so that the characteristics of a manual slide can be analyzed later. You can use software such as Mouse Controller (see part four of the article 使用 Python + Selenium 破解滑块验证码), but I found the tracks it recorded rather odd, so I decided to write a simple recorder in JavaScript.
let mFlag, sTime, eTime, sClientX, eClientX
let time = []
let track = []
btn = document.querySelector('.nc_iconfont.btn_slide')

function mouseDown(e) {
  sTime = new Date().getTime()
  mFlag = true
  sClientX = e.clientX
  time.push(0)
  track.push(0)
}

function mouseUp(e) {
  eTime = new Date().getTime()
  mFlag = false
  eClientX = e.clientX
  time.push(eTime - sTime)
  track.push(eClientX - sClientX)
  console.log(time.toString())
  console.log(track.toString())
  var blob = new Blob([time.toString() + '\n\n' + track.toString()], {
    type: 'text/plain'
  })
  a = document.createElement('a')
  a.download = 'recorder.txt'
  a.href = window.URL.createObjectURL(blob)
  a.dataset.downloadurl = ['text/plain', a.download, a.href].join(':')
  me = document.createEvent('MouseEvents')
  me.initMouseEvent('click', true, false, window, 0, 0, 0, 0, 0, false, false, false, false, 0, null)
  a.dispatchEvent(me)
  time = []
  track = []
}

function mouseMove(e) {
  if (mFlag) {
    time.push(new Date().getTime() - sTime)
    track.push(e.clientX - sClientX)
  }
}

btn.addEventListener('mousedown', mouseDown, null)
btn.addEventListener('mouseup', mouseUp, null)
btn.addEventListener('mousemove', mouseMove)

My recorded results are shown below (时间 = times in ms, x轴相对偏移 = relative x offsets):
时间:[0, 7, 23, 32, 39, 43, 56, 63, 71, 80, 88, 96, 103, 111, 119, 128, 136, 147, 151, 159, 167, 171, 180, 187, 196, 203, 211, 219, 227, 235, 243, 251, 263, 271, 279, 287, 295, 299, 308, 320, 327, 335, 343, 351, 360, 368, 375, 383, 391, 400, 407, 415, 424, 428, 435, 444, 451,460, 467, 475, 483, 492, 499, 510, 516, 524, 533, 638, 646, 732]
x轴相对偏移:[0, 1, 3, 4, 5, 8, 8, 10, 11, 13, 16, 17, 20, 23, 27, 30, 32, 34, 36, 38, 40, 41, 44, 47, 49, 52, 54, 57, 59, 63, 66, 69, 72, 76, 78, 82, 88, 93, 99, 103, 104, 108, 112, 117, 123, 128, 133, 136, 140, 145, 152, 158, 165, 173, 181, 189, 196, 204, 211, 217, 224, 231, 237, 243, 248, 254, 258, 264, 265, 265]
If you are interested, you could automate this and record many runs to improve accuracy.
Analyze the manual slide curve
The plotted curve looks like this:
numpy's curve-fitting tools can be used to fit it; a cubic polynomial turns out to fit reasonably well.
Once the curve equation is available, selenium can be used to simulate the slide.
# 文件名mouse_track_parser.py # 曲线拟合代码 import pylab import numpy as np from scipy.optimize import curve_fit class mouseTrackParser: time = [] track = [] def __init__(self, time, track): self.time = time self.track = track def myPolyfit(self, xx, exp): # 用多项式拟合 fit3 = np.polyfit(self.time, self.track, exp) formula3 = np.poly1d(fit3) yy = np.zeros(len(xx)) for idx, x in enumerate(xx): li = np.zeros(exp+1) for i in range(0, exp+1): li[i] = fit3[exp-i]*x**i yy[idx] = np.sum(li) return yy, fit3 def expFit(self, xx): def func(x, a, b): return a*np.exp(b/x) popt, pcov = curve_fit(func, self.time, self.track) # popt里面是拟合系数,读者可以自己help其用法 a = popt[0] b = popt[1] return func(xx, a, b) if __name__ == '__main__': timeData = [0, 7, 23, 32, 39, 43, 56, 63, 71, 80, 88, 96, 103, 111, 119, 128, 136, 147, 151, 159, 167, 171, 180, 187, 196, 203, 211, 219, 227, 235, 243, 251, 263, 271, 279, 287, 295, 299, 308, 320, 327, 335, 343, 351, 360, 368, 375, 383, 391, 400, 407, 415, 424, 428, 435, 444, 451, 460, 467, 475, 483, 492, 499, 510, 516, 524, 533, 638, 646, 732] trackData = [0, 1, 3, 4, 5, 8, 8, 10, 11, 13, 16, 17, 20, 23, 27, 30, 32, 34, 36, 38, 40, 41, 44, 47, 49, 52, 54, 57, 59, 63, 66, 69, 72, 76, 78, 82, 88, 93, 99, 103, 104, 108, 112, 117, 123, 128, 133, 136, 140, 145, 152, 158, 165, 173, 181, 189, 196, 204, 211, 217, 224, 231, 237, 243, 248, 254, 258, 264, 265, 265] mouseTrackParser = mouseTrackParser(timeData, trackData) pylab.plot(timeData, trackData, '.') pylab.plot(timeData, trackData, '-') xx = np.arange(0, 1000) yy, fits = mouseTrackParser.myPolyfit(xx, 3) # yy = expFit(xx, time, track) pylab.plot(xx, yy, 'r') pylab.show()
Simulate a human slide with selenium
Download the chromedriver that matches your Chrome browser, unzip it into the Chrome installation directory Google\Chrome\Application, and add it to the environment variables.
Create the ZhilianMS class.
# 文件名zhilian_ms.py # 导入的包 from selenium import webdriver from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.action_chains import ActionChains from time import sleep import numpy as np import pylab from mouse_track_parser import mouseTrackParser # 默认timeData和trackData # timeData = [], trackData = [] class ZhilianMS: def __init__(self, url, chromepath='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', timeData=timeData, trackData=trackData): self.driver = webdriver.Chrome(chromepath) self.timeData = timeData self.trackData = trackData self.url = url self.action = ActionChains(self.driver) # 生成鼠标运动轨迹方法 # offset:滑动距离,time:滑动时间 def get_trace(self, offset, time): timeRatio = time*1000/np.max(self.timeData) trackRatio = offset/np.max(self.trackData) msTime = np.array(self.timeData)*timeRatio msTrack = np.array(self.trackData)*trackRatio xx = np.arange(0, time*1000) yy, fits = mouseTrackParser( msTime, msTrack).myPolyfit(xx, 3) yy = np.abs(np.floor(offset/np.max(yy)*yy)) # pylab.plot(timeData, trackData, '.') # pylab.plot(msTime, msTrack, '-') # pylab.plot(xx, yy, '.') # pylab.show() return xx, yy # 网上给出的参考方法 """ def get_trace(self, distance): ''' :param distance: (Int)缺口离滑块的距离 :return: (List)移动轨迹 ''' # 创建存放轨迹信息的列表 trace = [] # 设置加速的距离 faster_distance = distance*(4/5) # 设置初始位置、初始速度、时间间隔 start, v0, t = 0, 0, 0.2 # 当尚未移动到终点时 while start < distance: # 如果处于加速阶段 if start < faster_distance: # 设置加速度为2 a = 1.5 # 如果处于减速阶段 else: # 设置加速度为-3 a = -3 # 移动的距离公式 move = v0 * t + 1 / 2 * a * t * t # 此刻速度 v = v0 + a * t # 重置初速度 v0 = v # 重置起点 start += move # 将移动的距离加入轨迹列表 trace.append(round(move)) # 返回轨迹信息 return trace """ # 封装判断元素是否存在方法 def isElemExist(self, cssSelector): flag = True try: self.driver.find_element_by_css_selector(cssSelector) return flag except: flag = False return flag # 判断是否需要验证 def if_need_verify(self, sliderSelector='#nocaptcha .nc_iconfont.btn_slide', wrapperSelector='.nc-container .nc_scale',): try: # 滑块 slide = self.driver.find_element_by_css_selector(sliderSelector) # 滑块包裹 nc_scale = self.driver.find_element_by_css_selector( wrapperSelector) except: print('未找到元素') return False return True # 模拟滑动方法 def drag_and_drop(self, ele, offset, time): xx, yy = self.get_trace(offset, time) self.action = ActionChains(self.driver) self.action.click_and_hold(ele).perform() for i in range(0, len(yy)): if self.if_need_verify() == False: break try: self.action.move_by_offset(yy[i], 0).perform() # self.action.reset_actions() except: break # ActionChains(driver).drag_and_drop_by_offset(ele, y, 0).perform() self.action.release().perform() sleep(3) warnElem = self.isElemExist('.nc-container .errloading') if warnElem == True: warnElem = self.driver.find_element_by_css_selector( '.nc-container .errloading') refreshBtn = warnElem.find_element_by_tag_name('a') refreshBtn.click() else: return True # 开始模拟 def start_simulate(self, sliderSelector='#nocaptcha .nc_iconfont.btn_slide', wrapperSelector='.nc-container .nc_scale', time=1, timeOut=3): self.driver.maximize_window() self.driver.get(self.url) self.driver.implicitly_wait(1) tryTime = 0 while self.if_need_verify(sliderSelector, wrapperSelector) and tryTime < timeOut: tryTime += 1 # 滑块 slide = self.driver.find_element_by_css_selector(sliderSelector) # 滑块包裹 nc_scale = self.driver.find_element_by_css_selector( wrapperSelector) slide_width = int(slide.value_of_css_property('width')[0:-2]) nc_scale_width = int(nc_scale.value_of_css_property('width')[0:-2]) 
print(slide_width, nc_scale_width) self.drag_and_drop(slide, nc_scale_width-slide_width, time) sleep(1) if tryTime < timeOut: return True, self.driver.find_element_by_xpath( "//*").get_attribute("outerHTML") return False, None # 结束并退出 def close(self): self.driver.quit() if __name__ == '__main__': zlms = ZhilianMS('https://127.0.0.1:9008') flag, html = zlms.start_simulate() print(flag)
Fetch the last four fields
Now the code in getDetailData that fetches the last four fields can be uncommented.
Instantiate a ZhiLian object and fetch ten pages of data for each of the keywords 大数据 and python.
zl = ZhiLian()
for times in range(0, 10):
    data = zl.getJsonData(zl.params_generator(times, '大数据'))
    if(data == False):
        pass
    else:
        zl.parseData(data)
for times in range(0, 10):
    data = zl.getJsonData(zl.params_generator(times, 'python'))
    if(data == False):
        pass
    else:
        zl.parseData(data)
zl.fclose()
Is this a problem with selenium or with the fitted function?
Stop the distributed cluster, edit the /opt/hadoop-2.6.4/etc/hadoop/slaves file, and start the pseudo-distributed system.
cd /opt/hadoop-2.6.4
./sbin/stop-yarn.sh
./sbin/stop-dfs.sh
vim etc/hadoop/slaves
## Comment out the slave nodes and add localhost
# slave0
# slave1
# slave2
localhost
## Restart the pseudo-distributed cluster
./sbin/start-dfs.sh
./sbin/start-yarn.sh

./bin/hdfs dfs -mkdir /input
./bin/hdfs dfs -put data.csv /input/

MapReduce program execution flow
You can use Eclipse or IntelliJ IDEA; here a maven project is created and developed with vscode (one way to scaffold it is sketched below).
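One possible way to scaffold the project from the command line; the groupId/artifactId match the pom.xml below, and the quickstart archetype is just a convenient starting point:

```bash
mvn archetype:generate -DgroupId=cn.itcast.hadoop.mr -DartifactId=datacount \
    -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
```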
Preparation: download hadoop-common-2.2.0-bin-master.zip and configure the environment variables: create a new variable named HADOOP_HOME whose value is the unzip location, and append %HADOOP_HOME%/bin to Path.

Create a test file with the following contents and upload it to the input directory on DFS:
Hello Java
Hello Python
Hello Hadoop
Hello JavaScript
Hello C++
Hello C#
Hello World
Hello World
Hello Java
Hello Python
Hello Hadoop
Hello Java
Hello Python
Hello Hadoop
HHello Hadoop
Hello JavaScript
Hello C++
Hello C#
Hello World
Hello World
Hello Java
Hello Hadoop
Hello JavaScript
Hello C++
./bin/hdfs dfs -mkdir /input
./bin/hdfs dfs -put test /input/

The pom.xml is configured as follows:
<modelVersion>4.0.0</modelVersion>
<groupId>cn.itcast.hadoop.mr</groupId>
<artifactId>datacount</artifactId>
<version>0.0.1-SNAPSHOT</version>
<properties>
  <maven.compiler.source>1.8</maven.compiler.source>
  <maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.2.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.2.0</version>
  </dependency>
</dependencies>
Create WordCountMapper.java.
package com.poryoung.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Split each input line on spaces and emit (word, 1) pairs
        String line = value.toString();
        String[] words = line.split(" ");
        for (int i = 0; i < words.length; i++) {
            context.write(new Text(words[i]), new IntWritable(1));
        }
    }
}
Create WordCountReducer.java.
package com.poryoung.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> ite, Context context)
            throws IOException, InterruptedException {
        // Sum the counts for each word
        int sum = 0;
        for (IntWritable i : ite) {
            sum += i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
Create WordCountDriver.java.
package com.poryoung.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // 1. Create a job and the task entry point
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCountDriver.class); // the class containing main
        // 2. Set the job's mapper and its output types
        job.setMapperClass(WordCountMapper.class); // the Mapper class
        job.setMapOutputKeyClass(Text.class);      // type of k2
        job.setMapOutputValueClass(IntWritable.class); // type of v2
        // 3. Set the job's reducer and its output types
        job.setReducerClass(WordCountReducer.class); // the Reducer class
        job.setOutputKeyClass(Text.class);           // type of k4
        job.setOutputValueClass(IntWritable.class);  // type of v4
        // 4. Set the job's input and output
        FileInputFormat.setInputPaths(job, new Path("hdfs://master:9000/input/test"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/output/"));
        // 5. Run the job once and report the result
        boolean ok = job.waitForCompletion(true);
        System.out.print(ok ? '0' : '1');
    }
}
Run the WordCountDriver main method (for example as sketched below) and inspect the results.
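If you prefer running it on the cluster rather than from the IDE, a sketch of packaging and submitting the job (the jar name follows from the pom.xml above; the hard-coded output path must not already exist):

```bash
mvn clean package
# Submit the job; input/output paths are hard-coded in WordCountDriver
hadoop jar target/datacount-0.0.1-SNAPSHOT.jar com.poryoung.mapreduce.WordCountDriver
```

The word counts below come from the job's output directory.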
C# 3
C++ 3
HHello 1
Hadoop 5
Hello 27
Java 5
JavaScript 3
Python 3
World 6
./bin/hdfs dfs -chown -R hadoop:hadoop /
These are largely the same as MySQL operations; see the Hive basic command summary (Hive基本命令整理).
Use the test file from the Day 7 MapReduce program as the test file.
Start the pseudo-distributed cluster and start YARN.
cd /opt/hadoop-2.6.4
./sbin/start-dfs.sh
./sbin/start-yarn.sh
Start Hive.
hive
Create a table with a single column txt of type String, using \t as the field delimiter.
create table wc(txt String) row format delimited fields terminated by '\t';
Load the data from the test file into the table.
load data local inpath '/opt/hadoop-2.6.4/input/test' overwrite into table wc;
Split the data into words, using a space as the delimiter.
select split(txt,' ') from wc;
select explode(split(txt,' ')) from wc;
Count the number of occurrences of each word.
select t1.word,count(t1.word) from (select explode(split(txt ,' ')) word from wc)t1 group by t1.word;
The results are as follows:
Use the zhilian.csv Zhilian recruitment data scraped on Day 6 as the source data and clean it by filtering out empty records.
Upload the data to the NameNode host, then put it into the HDFS file system.
cd /opt/hadoop-2.6.4
./bin/hdfs dfs -put ./input/zhilian.csv /input/
Examine the structure of the data, then write the CleanMapper program.
package com.poryoung.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.ArrayList;

public class CleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value,
            Mapper<LongWritable, Text, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        // Skip the header row
        if (!key.toString().equals("0")) {
            // Split on commas that are outside double quotes
            String[] strList = value.toString().split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
            ArrayList<String> arrayList = new ArrayList<String>();
            for (int i = 0; i < strList.length; i++) {
                String str = strList[i].trim();
                if (str.isEmpty() || str.equals("[]") || str.equals("[\"\"]")) {
                    // Drop records with empty fields
                    return;
                }
                arrayList.add(strList[i]);
            }
            // Replace the comma separators with `|`
            context.write(NullWritable.get(), new Text(String.join("|", arrayList)));
        } else {
            context.write(NullWritable.get(), value);
        }
    }
}
Write the ReduceMapper program.
package com.poryoung.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReduceMapper extends Reducer<NullWritable, Text, NullWritable, Text> {
    @Override
    protected void reduce(NullWritable arg0, Iterable<Text> values,
            Reducer<NullWritable, Text, NullWritable, Text>.Context context)
            throws IOException, InterruptedException {
        // Use the first record's column count as the reference and keep only matching rows
        int count = 0;
        for (Text text : values) {
            String[] dataList = text.toString().split("\\|");
            if (count == 0) {
                count = dataList.length;
            } else if (dataList.length == count) {
                context.write(NullWritable.get(), new Text(String.join("|", dataList)));
            }
        }
    }
}
Write the main class and run the program.
package com.poryoung.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // 1. Create a job and the task entry point
        Job job = Job.getInstance(configuration);
        job.setJarByClass(Main.class); // the class containing main
        // 2. Set the job's mapper and its output types
        job.setMapperClass(CleanMapper.class); // the Mapper class
        job.setMapOutputKeyClass(NullWritable.class); // type of k2
        job.setMapOutputValueClass(Text.class);       // type of v2
        // 3. Set the job's reducer and its output types
        job.setReducerClass(ReduceMapper.class); // the Reducer class
        job.setOutputKeyClass(NullWritable.class); // type of k4
        job.setOutputValueClass(Text.class);       // type of v4
        // 4. Set the job's input and output
        FileInputFormat.setInputPaths(job, new Path("hdfs://master:9000/input/zhilian.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/output/zhilian/"));
        // Delete any existing output directory
        new Path("hdfs://master:9000/").getFileSystem(configuration)
                .delete(new Path("hdfs://master:9000/output/zhilian/"));
        // 5. Run the job once and report the result
        boolean ok = job.waitForCompletion(true);
        System.out.print(ok ? '0' : '1');
    }
}
Save the cleaned data into the cleandata table of the task database in Hive.
create database task;
use task;
# Create the table following the structure of the csv file
create table cleandata(jid String,jname String,company String,companyUrl String,companyLogo String,companyType String,city String,edu String,salary String,welfare String,recruit int,hightlight String,desc String,skill String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXTFILE;
# Check the table structure
desc cleandata;
# Load the cleaned data into the table
load data inpath '/output/zhilian/part*' into table cleandata;
# Check that the load succeeded
select * from cleandata limit 5;
Count the number of openings for each job title
Create a jobRecruit table to hold the per-job recruitment counts.
create table jobRecruit(jname String,recruit int);
Compute the number of openings per job and write it into the jobRecruit table.
# jname is the job title scraped into cleandata; recruit is the scraped number of openings
insert into table jobRecruit (select jname,sum(recruit) as recruit from cleandata group by jname);
# View the result
select * from jobRecruit;
Count how often programming languages appear in the job descriptions
Create a key_count_dirty table to hold the raw first-pass (dirty) counts.
create table key_count_dirty(key String,count int);
Use the regular expression [^a-zA-Z0-9\#\+\. ] to extract the keywords. Since the data contains few English descriptions, keywords from that part are ignored, and the space character is also included in the negated match.
insert into table key_count_dirty (select t.key as key,count(*) as count from ((select explode(split(desc,'[^a-zA-Z0-9\#\+\. ]')) as key from cleandata)as t) group by t.key);
Continue cleaning the dirty data; create a new table key_count_clean1.
create table key_count_clean1(key String,count int);
# Filter out pure numbers, salary figures, and keys that start with + or end with a single +
insert into table key_count_clean1 (select key,count from key_count_dirty where key not regexp '^\\d|^\\+|[a-zA-Z]{2,}\\+');
Recover the filtered keywords that carry symbols, such as +python and java+.
create table get_back(key String,count int);
insert into table get_back select key,count from key_count_dirty where key regexp '^\\+[a-zA-z]|[a-zA-z]\\+(?!\\+)';
# Split the keywords on +, then write the counts into key_count_clean1
insert into table key_count_clean1 select t.key,count(*) from((select explode(split(key,'\\+')) as key from get_back) as t)group by t.key;
Convert all keys to lowercase and merge the counts.
create table key_count_clean2(key String,count int);
insert into table key_count_clean2 select lower(key) as key,sum(count) as count from key_count_clean1 group by key;
View the statistics.
select * from key_count_clean2;
P.S. insert overwrite can overwrite an existing table, which avoids having to create a new table each time (a sketch follows).
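For example, re-running the lowercase merge could simply overwrite key_count_clean2 instead of requiring yet another table (a sketch, not what was actually run here):

```bash
hive -e "insert overwrite table key_count_clean2 select lower(key), sum(count) from key_count_clean1 group by lower(key);"
```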