(1)编写conf文件
在flume的conf目录下新建文件
a1.sources=r1
a1.channels=c1
a1.sinks=s1
a1.sources.r1.type=exec
a1.sources.r1.command=tail -F /opt/module/flume-1.9.0/conf/data/zh_all2.txt
a1.sources.r1.bind=0.0.0.0
a1.sources.r1.port=44444
a1.sinks.s1.type=HDFS
a1.sinks.s1.hdfs.path=hdfs://hadoop129:9000/user/flume/qcwy_txt
a1.sinks.s1.hdfs.rollCount=0
a1.sinks.s1.hdfs.fileType=Datastream
# 配置a1的channel组件c1的属性
a1.channels.c1.type=memory
a1.channels.c1.capacity=1000
a1.channels.c1.transactionCapacity=100
# 为source和sink组件绑定channel
a1.sources.r1.channels=c1
a1.sinks.s1.channel=c1
1、利用hive进行分析;2、利用sqoop将hive的分析结果导出到mysql数据库中,最后展示分析结果
分析“数据分析”、“大数据开发工程师”、“数据采集”等岗位的平均工资、最高工资、最低工资,并作条形图将结果展示出来
A. 数据分析岗位
(1)模糊匹配提取
以模糊匹配提取出数据分析岗位的记录,存入表f_x_1(只存Jobtitle和wages字段)
(2)切分薪资字段存储
create table f_x_2 as select Jobtitle, regexp_extract(wages,'([0-9]+)-',1) as a_min, regexp_extract(wages,'-([0-9]+)',1) as a_max, (regexp_extract(wages,'([0-9]+)-',1) + regexp_extract(wages,'-([0-9]+)',1))/2 as a_avg from f_x_1;
create table f_x_3 as select "数据分析" as Jobtitle, min(int(a_min)*0.1) as s_min, max(int(a_max)*0.1) as s_max, regexp_extract(avg(a_avg),'([0-9]+.[0-9]?[0-9]?)',1)*0.1 as s_avg from f_x_2;
汇总
(4)“大数据开发工程师”、“数据采集”岗位的查询方法与上面类似,最后将各岗位的结果汇总为一张总表
(1)在mysql创建数据库数据表
进入数据库:mysql -u root -p
创建qcwy_db数据库
使用qcwy_db数据库创建表
(1)创建表:create table tab1(t_name varchar(20), t_min int, t_max int, t_avg varchar(10)) charset utf8 collate utf8_general_ci;
(2)导入数据
bin/sqoop export --connect jdbc:mysql://hadoop129:3306/qcwy_db --username root --password 1 --table tab1 --export-dir /user/hive/warehouse/qcwy_db.db/tab1 --input-null-string "\\\\N" --input-null-non-string "\\\\N" --input-fields-terminated-by "\001" --input-lines-terminated-by "\\n" -m 1
创建远程访问mysql数据库用户
GRANT ALL PRIVILEGES ON *.* TO 'admin'@'%' IDENTIFIED BY '1' WITH GRANT OPTION;
import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts
class MysqlTool:
    """Thin helper around pymysql that opens one connection per statement.

    Every public method connects, runs a single statement and closes the
    connection again. Errors are printed rather than raised: write methods
    then return the row count obtained so far (0 if ``execute`` failed) and
    read methods return ``None``.
    """

    def __init__(self, host, user, password, database, port=3306, charset='utf8'):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port
        self.charset = charset
        # Set by connect(); kept as None while nothing is open so that
        # close() is always safe to call.
        self.conn = None
        self.cursor = None

    def connect(self):
        """Open a connection with the stored settings and create a cursor."""
        self.conn = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,
            charset=self.charset,
        )
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the cursor and the connection, if they are open."""
        cursor, conn = self.cursor, self.conn
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Close the connection even when closing the cursor failed.
            if conn is not None:
                conn.close()

    def _close_quietly(self):
        """Close the connection, printing (not raising) any error."""
        try:
            self.close()
        except Exception as e:
            print(e)

    def __cud(self, sql, args=None):
        """Run one insert/update/delete statement and commit it.

        Args:
            sql: The SQL statement to execute.
            args: Values for a parameterised statement, or None.

        Returns:
            The value returned by ``cursor.execute`` (the affected row
            count), or 0 if ``execute`` did not complete.
        """
        row_count = 0
        try:
            self.connect()
            row_count = self.cursor.execute(sql, args)
            self.conn.commit()
        except Exception as e:
            print(e)
        finally:
            # Release the connection on the failure path too.
            self._close_quietly()
        return row_count

    def insert(self, sql, args):
        """Execute an INSERT statement; see ``__cud`` for the return value."""
        return self.__cud(sql, args)

    def updata(self, sql, args):
        """Execute an UPDATE statement (historical, misspelled name)."""
        return self.__cud(sql, args)

    def update(self, sql, args):
        """Correctly spelled alias of ``updata``."""
        return self.__cud(sql, args)

    def delete(self, sql, args):
        """Execute a DELETE statement; see ``__cud`` for the return value."""
        return self.__cud(sql, args)

    def _fetch(self, sql, args, fetch_all):
        """Run a query and return one row or all rows; None on error."""
        try:
            self.connect()
            self.cursor.execute(sql, args)
            if fetch_all:
                return self.cursor.fetchall()
            return self.cursor.fetchone()
        except Exception as e:
            print(e)
            return None
        finally:
            self._close_quietly()

    def get_one(self, sql, args=None):
        """Return the first row of the query result, or None on error."""
        return self._fetch(sql, args, False)

    def get_all(self, sql, args=None):
        """Return all rows of the query result, or None on error."""
        return self._fetch(sql, args, True)
# Shared database helper used by the query functions in this script.
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration.
mt = MysqlTool('192.168.10.129', 'root', '1', 'hive')


def show_text():
    """Fetch every row of the ``work_1`` table.

    Returns:
        Whatever ``mt.get_all`` returns for the query. Previously the result
        was fetched and then discarded.
    """
    sql = "select * from work_1"
    return mt.get_all(sql)
# Extract the job-title names.
def show_name(list):
    """Keep only the characters in the range U+4E00..U+9FA5 of each item.

    Args:
        list: Iterable of values; each one is converted with ``str`` before
            matching. (The name shadows the builtin but is kept so existing
            callers are unaffected.)

    Returns:
        A list with one string per input item, containing the matched
        characters in their original order ('' when nothing matches).
    """
    import re  # local import: `re` is not imported at file level

    cjk = re.compile('[\u4e00-\u9fa5]')
    return [''.join(cjk.findall(str(item))) for item in list]
# Collect the wage statistics for each query and plot them.
def show_bar_chart1(data1, cc):
    """Compute max / average / min wages per query and draw a bar chart.

    Args:
        data1: Iterable of queries; each one is passed to ``db.find``.
        cc: X-axis labels forwarded to ``show``.
    """
    # NOTE(review): `db` and `pd` are module-level names that the visible
    # file neither defines nor imports — confirm where they come from.
    average_Pay_level = []
    max_Pay_level = []
    min_Pay_level = []
    for query in data1:
        frame = pd.DataFrame(list(db.find(query)))
        wages = frame['wages'].values
        # Evaluate the statistics once instead of once per value.
        highest, mean, lowest = Pay_level_list(wages)
        max_Pay_level.append(highest)
        average_Pay_level.append(mean)
        min_Pay_level.append(lowest)
    show(max_Pay_level, average_Pay_level, min_Pay_level, cc)
# `data` is the list of raw wage strings.
# After normalising the units, return the maximum, average and minimum.
def Pay_level_list(data):
    """Normalise wage strings to one unit and summarise them.

    Non-string items are ignored. For each string containing a known unit
    suffix, all numbers in it are extracted and scaled so that every value
    is expressed in the 万/年 unit:
    千/月 -> x * 12 / 10, 万/月 -> x * 12, 万/年 -> x, 元/天 -> x * 365 / 10000.
    The scaled numbers of one string are reduced to one value by ``num_al``.

    Args:
        data: Iterable of wage descriptions.

    Returns:
        A tuple ``(maximum, average, minimum)`` where the average comes from
        ``tall_num``.

    Raises:
        ValueError: If no item of ``data`` yields a wage value.
    """
    import re  # local import: `re` is not imported at file level

    # (unit suffix, multiplier, divisor); kept as separate integers so the
    # arithmetic is x * mul / div exactly.
    unit_factors = (
        ("千/月", 12, 10),
        ("万/月", 12, 1),
        ("万/年", 1, 1),
        ("元/天", 365, 10000),
    )
    number = re.compile(r"\d+\.?\d*")

    levels = []
    for wage in data:
        if not isinstance(wage, str):
            continue
        for unit, mul, div in unit_factors:
            if unit not in wage:
                continue
            values = [round(float(tok) * mul / div, 2)
                      for tok in number.findall(wage)]
            # A string that names a unit but has no number carries no wage.
            if values:
                levels.append(num_al(values))
    if not levels:
        raise ValueError("no parsable wage strings in data")
    return max(levels), tall_num(levels), min(levels)
# Compute the average.
def tall_num(list):
    """Return the arithmetic mean of ``list`` rounded to two decimals.

    The previous version divided by ``len(list) + 1``, which understated
    every average. An empty input still yields 0.0.

    Args:
        list: Sequence of numbers (name kept for existing callers).
    """
    count = len(list)
    if count == 0:
        return 0.0
    return round(sum(list) / count, 2)
def num_al(list):
    """Reduce the numbers of one wage string to a single value.

    Args:
        list: Non-empty sequence of numbers (name kept for existing callers).

    Returns:
        The mean of the first two entries when there are at least two,
        otherwise the single entry, rounded to two decimals.

    Raises:
        IndexError: If ``list`` is empty.
    """
    if len(list) >= 2:
        # Only the first two numbers are used; further entries are ignored.
        value = (list[0] + list[1]) / 2
    else:
        value = list[0]
    return round(value, 2)
# Draw the bar chart.
def show(a, b, c, d, save_path='D:\\result.png'):
    """Draw a grouped bar chart of highest, average and lowest wages.

    Args:
        a: Highest wage per group.
        b: Average wage per group.
        c: Lowest wage per group.
        d: X-axis labels; ``a``, ``b``, ``c`` and ``d`` must have the same
            length.
        save_path: File the chart is saved to. Defaults to the path that was
            previously hard-coded.
    """
    import numpy as np  # `pd.np` no longer exists in current pandas

    # NOTE(review): `plt` is expected to be matplotlib.pyplot provided at
    # module level; the visible file does not import it — confirm.

    # Figure-level rcParams only affect figures created afterwards, so they
    # are set before the first plotting call.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['savefig.dpi'] = 300
    plt.rcParams['figure.dpi'] = 300
    plt.rcParams['figure.figsize'] = (15.0, 8.0)

    positions = np.arange(len(d))
    width = 0.25
    plt.bar(positions, a, width=width, label='最高工资', color='red')
    # The tick labels are attached to the middle series of each group.
    plt.bar(positions + width, b, width=width, label='平均工资',
            color='deepskyblue', tick_label=d)
    plt.bar(positions + 2 * width, c, width=width, label='最低工资', color='green')

    # Write each value just above its bar.
    for offset, heights in enumerate((a, b, c)):
        for pos, height in zip(positions, heights):
            plt.text(pos + offset * width, height + 0.1, height,
                     ha='center', va='bottom')

    plt.legend(loc="upper left")  # keep the legend away from the bars
    plt.ylabel('月/K')
    plt.xlabel('岗位名称')
    plt.title("工资分析")
    plt.savefig(save_path)
    plt.show()
import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts
class MysqlTool:
    """Thin helper around pymysql that opens one connection per statement.

    Every public method connects, runs a single statement and closes the
    connection again. Errors are printed rather than raised: write methods
    then return the row count obtained so far (0 if ``execute`` failed) and
    read methods return ``None``.
    """

    def __init__(self, host, user, password, database, port=3306, charset='utf8'):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port
        self.charset = charset
        # Set by connect(); kept as None while nothing is open so that
        # close() is always safe to call.
        self.conn = None
        self.cursor = None

    def connect(self):
        """Open a connection with the stored settings and create a cursor."""
        self.conn = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,
            charset=self.charset,
        )
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the cursor and the connection, if they are open."""
        cursor, conn = self.cursor, self.conn
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Close the connection even when closing the cursor failed.
            if conn is not None:
                conn.close()

    def _close_quietly(self):
        """Close the connection, printing (not raising) any error."""
        try:
            self.close()
        except Exception as e:
            print(e)

    def __cud(self, sql, args=None):
        """Run one insert/update/delete statement and commit it.

        Args:
            sql: The SQL statement to execute.
            args: Values for a parameterised statement, or None.

        Returns:
            The value returned by ``cursor.execute`` (the affected row
            count), or 0 if ``execute`` did not complete.
        """
        row_count = 0
        try:
            self.connect()
            row_count = self.cursor.execute(sql, args)
            self.conn.commit()
        except Exception as e:
            print(e)
        finally:
            # Release the connection on the failure path too.
            self._close_quietly()
        return row_count

    def insert(self, sql, args):
        """Execute an INSERT statement; see ``__cud`` for the return value."""
        return self.__cud(sql, args)

    def updata(self, sql, args):
        """Execute an UPDATE statement (historical, misspelled name)."""
        return self.__cud(sql, args)

    def update(self, sql, args):
        """Correctly spelled alias of ``updata``."""
        return self.__cud(sql, args)

    def delete(self, sql, args):
        """Execute a DELETE statement; see ``__cud`` for the return value."""
        return self.__cud(sql, args)

    def _fetch(self, sql, args, fetch_all):
        """Run a query and return one row or all rows; None on error."""
        try:
            self.connect()
            self.cursor.execute(sql, args)
            if fetch_all:
                return self.cursor.fetchall()
            return self.cursor.fetchone()
        except Exception as e:
            print(e)
            return None
        finally:
            self._close_quietly()

    def get_one(self, sql, args=None):
        """Return the first row of the query result, or None on error."""
        return self._fetch(sql, args, False)

    def get_all(self, sql, args=None):
        """Return all rows of the query result, or None on error."""
        return self._fetch(sql, args, True)
# Shared database helper used by the query functions in this script.
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration.
mt = MysqlTool('192.168.10.129', 'root', '1', 'hive')


def show_text():
    """Fetch every row of the ``work_1`` table.

    Returns:
        Whatever ``mt.get_all`` returns for the query. Previously the result
        was fetched and then discarded.
    """
    sql = "select * from work_1"
    return mt.get_all(sql)
# Extract the job-title names.
def show_name(list):
    """Keep only the characters in the range U+4E00..U+9FA5 of each item.

    Args:
        list: Iterable of values; each one is converted with ``str`` before
            matching. (The name shadows the builtin but is kept so existing
            callers are unaffected.)

    Returns:
        A list with one string per input item, containing the matched
        characters in their original order ('' when nothing matches).
    """
    import re  # local import: `re` is not imported at file level

    cjk = re.compile('[\u4e00-\u9fa5]')
    return [''.join(cjk.findall(str(item))) for item in list]
# Pie chart.
def pie_chart(list1):
    """Draw a pie chart of how many records each query matches.

    Args:
        list1: Sequence of queries. Each one is indexed with
            ``"recruiters"`` to build its label and is passed to ``db.find``
            to count the matching records.
    """
    # NOTE(review): `db` and `plt` are module-level names that the visible
    # file neither defines nor imports — confirm where they come from.
    labels = show_name([query["recruiters"] for query in list1])
    counts = []
    for idx, query in enumerate(list1):
        # Count the matches directly; no DataFrame is needed for a length.
        matched = len(list(db.find(query)))
        counts.append(matched)
        labels[idx] += str(matched)  # append the count to the slice label

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.pie(counts, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
    plt.title("饼图示例-岗位数")
    plt.show()
import pymysql
from pyecharts.charts import Bar
from pyecharts import options as opts
class MysqlTool:
    """Thin helper around pymysql that opens one connection per statement.

    Every public method connects, runs a single statement and closes the
    connection again. Errors are printed rather than raised: write methods
    then return the row count obtained so far (0 if ``execute`` failed) and
    read methods return ``None``.
    """

    def __init__(self, host, user, password, database, port=3306, charset='utf8'):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.port = port
        self.charset = charset
        # Set by connect(); kept as None while nothing is open so that
        # close() is always safe to call.
        self.conn = None
        self.cursor = None

    def connect(self):
        """Open a connection with the stored settings and create a cursor."""
        self.conn = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            port=self.port,
            charset=self.charset,
        )
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the cursor and the connection, if they are open."""
        cursor, conn = self.cursor, self.conn
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            # Close the connection even when closing the cursor failed.
            if conn is not None:
                conn.close()

    def _close_quietly(self):
        """Close the connection, printing (not raising) any error."""
        try:
            self.close()
        except Exception as e:
            print(e)

    def __cud(self, sql, args=None):
        """Run one insert/update/delete statement and commit it.

        Args:
            sql: The SQL statement to execute.
            args: Values for a parameterised statement, or None.

        Returns:
            The value returned by ``cursor.execute`` (the affected row
            count), or 0 if ``execute`` did not complete.
        """
        row_count = 0
        try:
            self.connect()
            row_count = self.cursor.execute(sql, args)
            self.conn.commit()
        except Exception as e:
            print(e)
        finally:
            # Release the connection on the failure path too.
            self._close_quietly()
        return row_count

    def insert(self, sql, args):
        """Execute an INSERT statement; see ``__cud`` for the return value."""
        return self.__cud(sql, args)

    def updata(self, sql, args):
        """Execute an UPDATE statement (historical, misspelled name)."""
        return self.__cud(sql, args)

    def update(self, sql, args):
        """Correctly spelled alias of ``updata``."""
        return self.__cud(sql, args)

    def delete(self, sql, args):
        """Execute a DELETE statement; see ``__cud`` for the return value."""
        return self.__cud(sql, args)

    def _fetch(self, sql, args, fetch_all):
        """Run a query and return one row or all rows; None on error."""
        try:
            self.connect()
            self.cursor.execute(sql, args)
            if fetch_all:
                return self.cursor.fetchall()
            return self.cursor.fetchone()
        except Exception as e:
            print(e)
            return None
        finally:
            self._close_quietly()

    def get_one(self, sql, args=None):
        """Return the first row of the query result, or None on error."""
        return self._fetch(sql, args, False)

    def get_all(self, sql, args=None):
        """Return all rows of the query result, or None on error."""
        return self._fetch(sql, args, True)
# Shared database helper used by the query functions in this script.
# NOTE(review): host and credentials are hard-coded; consider moving them to
# configuration.
mt = MysqlTool('192.168.10.129', 'root', '1', 'hive')


def show_text():
    """Fetch every row of the ``work_1`` table.

    Returns:
        Whatever ``mt.get_all`` returns for the query. Previously the result
        was fetched and then discarded.
    """
    sql = "select * from work_1"
    return mt.get_all(sql)
# `data` is the list of raw wage strings.
# After normalising the units, return the maximum, average and minimum.
def Pay_level_list(data):
    """Normalise wage strings to one unit and summarise them.

    Non-string items are ignored. For each string containing a known unit
    suffix, all numbers in it are extracted and scaled so that every value
    is expressed in the 万/年 unit:
    千/月 -> x * 12 / 10, 万/月 -> x * 12, 万/年 -> x, 元/天 -> x * 365 / 10000.
    The scaled numbers of one string are reduced to one value by ``num_al``.

    Args:
        data: Iterable of wage descriptions.

    Returns:
        A tuple ``(maximum, average, minimum)`` where the average comes from
        ``tall_num``.

    Raises:
        ValueError: If no item of ``data`` yields a wage value.
    """
    import re  # local import: `re` is not imported at file level

    # (unit suffix, multiplier, divisor); kept as separate integers so the
    # arithmetic is x * mul / div exactly.
    unit_factors = (
        ("千/月", 12, 10),
        ("万/月", 12, 1),
        ("万/年", 1, 1),
        ("元/天", 365, 10000),
    )
    number = re.compile(r"\d+\.?\d*")

    levels = []
    for wage in data:
        if not isinstance(wage, str):
            continue
        for unit, mul, div in unit_factors:
            if unit not in wage:
                continue
            values = [round(float(tok) * mul / div, 2)
                      for tok in number.findall(wage)]
            # A string that names a unit but has no number carries no wage.
            if values:
                levels.append(num_al(values))
    if not levels:
        raise ValueError("no parsable wage strings in data")
    return max(levels), tall_num(levels), min(levels)
# Compute the average.
def tall_num(list):
    """Return the arithmetic mean of ``list`` rounded to two decimals.

    The previous version divided by ``len(list) + 1``, which understated
    every average. An empty input still yields 0.0.

    Args:
        list: Sequence of numbers (name kept for existing callers).
    """
    count = len(list)
    if count == 0:
        return 0.0
    return round(sum(list) / count, 2)
def num_al(list):
    """Reduce the numbers of one wage string to a single value.

    Args:
        list: Non-empty sequence of numbers (name kept for existing callers).

    Returns:
        The mean of the first two entries when there are at least two,
        otherwise the single entry, rounded to two decimals.

    Raises:
        IndexError: If ``list`` is empty.
    """
    if len(list) >= 2:
        # Only the first two numbers are used; further entries are ignored.
        value = (list[0] + list[1]) / 2
    else:
        value = list[0]
    return round(value, 2)
# Collect the wage statistics for each query and plot them.
def show_bar_chart1(xx, cc):
    """Compute max / average / min wages per query and draw a bar chart.

    Args:
        xx: Iterable of queries; each one is passed to ``db.find``.
        cc: X-axis labels forwarded to ``show``.
    """
    # NOTE(review): `db` and `pd` are module-level names that the visible
    # file neither defines nor imports — confirm where they come from.
    average_Pay_level = []
    max_Pay_level = []
    min_Pay_level = []
    for query in xx:
        frame = pd.DataFrame(list(db.find(query)))
        wages = frame['wages'].values
        # Evaluate the statistics once instead of once per value.
        highest, mean, lowest = Pay_level_list(wages)
        max_Pay_level.append(highest)
        average_Pay_level.append(mean)
        min_Pay_level.append(lowest)
    show(max_Pay_level, average_Pay_level, min_Pay_level, cc)
def show(a, b, c, d, save_path='D:\\result.png'):
    """Draw a grouped bar chart of highest, average and lowest wages.

    Args:
        a: Highest wage per group.
        b: Average wage per group.
        c: Lowest wage per group.
        d: X-axis labels; ``a``, ``b``, ``c`` and ``d`` must have the same
            length.
        save_path: File the chart is saved to. Defaults to the path that was
            previously hard-coded.
    """
    import numpy as np  # `pd.np` no longer exists in current pandas

    # NOTE(review): `plt` is expected to be matplotlib.pyplot provided at
    # module level; the visible file does not import it — confirm.

    # Figure-level rcParams only affect figures created afterwards, so they
    # are set before the first plotting call.
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.rcParams['savefig.dpi'] = 300
    plt.rcParams['figure.dpi'] = 300
    plt.rcParams['figure.figsize'] = (15.0, 8.0)

    positions = np.arange(len(d))
    width = 0.25
    plt.bar(positions, a, width=width, label='最高工资', color='red')
    # The tick labels are attached to the middle series of each group.
    plt.bar(positions + width, b, width=width, label='平均工资',
            color='green', tick_label=d)
    plt.bar(positions + 2 * width, c, width=width, label='最低工资', color='pink')

    # Write each value just above its bar.
    for offset, heights in enumerate((a, b, c)):
        for pos, height in zip(positions, heights):
            plt.text(pos + offset * width, height + 0.1, height,
                     ha='center', va='bottom')

    plt.legend(loc="upper left")  # keep the legend away from the bars
    plt.ylabel('月/K')
    plt.xlabel('经验年限')
    plt.title("工作年限工资图")
    plt.savefig(save_path)
    plt.show()