Job script
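This driver script schedules the extract_mergeddoc MapReduce job over 64 input shards. It first guards against concurrent runs and kills any leftover job processes; then, for each shard, it regenerates the output only when the input's Latest marker is newer than the existing output, polling HDFS for a completion marker and giving up after 60 minutes.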

#!/bin/bash

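# locations of the Hadoop client, the HDFS helper binaries, and the MapReduce job binary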
hadoop_binary=hadoop/yarn-client/bin/hadoop
hdfs_bin_dir=/home/search/bin
mapreduce_binary_name=extract_mergeddoc

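# concurrency guard: bail out if too many instances of this script appear in ps
# (the threshold of 3 presumably leaves room for this shell's own pipeline subshells)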
shell_name="run_goral.sh"
declare -i exist=$(ps x | grep -v "grep" | grep -c "$shell_name")
echo $exist
if [[ $exist -gt 3 ]]; then
  echo "this script is already running, aborting"
  exit
fi

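# mapred_run <input_path> <output_path>
# launch the extract_mergeddoc job over the input and, once the command returns,
# create the output's Latest marker directory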
mapred_run() {
  data_input_path=$1
  data_output_path=$2
  cmd="/application/search/online/$mapreduce_binary_name
      --auto_run
      --num_mapper=1024
      --num_reducer=1
      --input_format=sstable
      --output_format=sstable
      --hdfs_bin_dir=$hdfs_bin_dir
      --hadoop_binary=$hadoop_binary
      --hdfs_input_paths=$data_input_path
      --hdfs_output_dir=$data_output_path
      --hdfs_host=hdfs:
      --hadoop_fs_thrift_server=localhost
      --find_sstable_recursive
      --release_yrdata_prefix=/opt/data/depending
      --custom_mapred_params=mapred.job.map.memory.mb:4096^o^user.email:xxx
  "
  echo $cmd
  $cmd
  $hadoop_binary fs -mkdir $data_output_path/Latest
}

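# kill every running extract_mergeddoc process found in ps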
kill_all_mapred() {
  echo "stopping all "$mapreduce_binary_name
  ID=$(ps x | grep $mapreduce_binary_name | grep -v "grep" | awk '{print $1}')
  echo $ID
  for id in $ID; do
    kill $id
    echo "kill "$id
  done
}

# kill all running MapReduce jobs first
kill_all_mapred

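# walk all 64 input shards and regenerate any whose input is newer than its output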
i=0
while (($i < 64)); do
  data_input_dir="/home/search/searcharch/production/indexing_pipeline/goral/base_doc/shard_$i"
  data_output_dir_prefix="/home/search/searcharch/production/indexing_pipeline/low_quality/sensitive/shard_$i"
  echo $data_input_dir
  $hadoop_binary fs -ls $data_input_dir | grep $data_input_dir | awk -F" " '{print $NF}' | while read data_real_input_dir; do
    test_dir=$data_real_input_dir"/Latest"
    echo $test_dir
    # skip input directories that have no Latest marker yet
    $hadoop_binary fs -test -e $test_dir
    if [ $? -ne 0 ]; then
      continue
    fi

    current_date=$(date +%Y_%m_%d)
    data_output_dir=$data_output_dir_prefix/$current_date-$i
    data_output_sign_dir=$data_output_dir"/Latest"

    # first run: no output has ever been generated for this shard
    $hadoop_binary fs -test -e $data_output_dir_prefix
    if [ $? -ne 0 ]; then
      mapred_run $data_real_input_dir $data_output_dir

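      # poll once a minute for the completion marker; if it has not appeared
      # after 60 minutes, kill the job and give up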
      run_over=0
      sleep_cnt=0
      while [ $run_over -eq 0 ]; do
        sleep 60
        sleep_cnt=$(($sleep_cnt+1))
        $hadoop_binary fs -test -e $data_output_sign_dir
        if [ $? -eq 0 ]; then
          run_over=1
        fi
        if [ $sleep_cnt -gt 60 ]; then
          kill_all_mapred
          exit
        fi
      done

      continue
    fi

    # get the shard's last modification time
    command_get_file_time=$hadoop_binary" fs -stat "$test_dir
    input_update_time=$($command_get_file_time)
    echo $input_update_time

    # get the modification time of the corresponding output data
    command_get_file_time=$hadoop_binary" fs -stat "$data_output_dir_prefix
    output_update_time=$($command_get_file_time)
    echo $output_update_time

    # if the output is newer than the input shard, there is nothing to regenerate;
    # hadoop fs -stat prints a sortable yyyy-MM-dd HH:mm:ss timestamp, so plain
    # string comparison is enough here
    if [ "$input_update_time" \< "$output_update_time" ]; then
      continue
    fi

    # find any existing Latest markers under the output prefix and remove them
    $hadoop_binary fs -ls $data_output_dir_prefix | grep $data_output_dir_prefix | awk -F" " '{print $NF}' | while read output_dir; do
      temp_dir=$output_dir"/Latest"
      $hadoop_binary fs -test -e $temp_dir
      if [ $? -eq 0 ]; then
        $hadoop_binary fs -rm -r $temp_dir
      fi
    done

    # regenerate the data from the real input directory found above
    mapred_run $data_real_input_dir $data_output_dir

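    # same completion poll as above: wait up to 60 minutes for the Latest marker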
    run_over=0
    sleep_cnt=0
    while [ $run_over -eq 0 ]; do
      echo "sleep"
      sleep 60
      sleep_cnt=$(($sleep_cnt+1))
      $hadoop_binary fs -test -e $data_output_sign_dir
      if [ $? -eq 0 ]; then
        run_over=1
      fi
      if [ $sleep_cnt -gt 60 ]; then
        kill_all_mapred
        exit
      fi
    done
  done

  i=$(($i+1))
done

