hbase|Hbase数据导出实例


      1. Hbase数据导出实例
需求:
根据时间范围、区域等条件查询,将hbase中终端采集数据最大时间、最小时间的日志数据导出
参考文档:
http://blog.csdn.net/qq_27078095/article/details/56482010
https://www.cnblogs.com/szw-blog/p/5512349.html
http://blog.csdn.net/javajxz008/article/details/61173213
http://www.cnblogs.com/husky/p/6422001.html
https://my.oschina.net/wangjiankui/blog/497658
https://www.2cto.com/net/201708/673854.html
http://www.cnblogs.com/husky/p/6764317.html

解决办法:
  1. 通过hbase自带导入导出将查询到的终端mac数据导出到指定目录
hbase org.apache.hadoop.hbase.mapreduce.Driver export 表名称目录
这样导出是整个表数据,没法过滤,姑且暂时不能使用。
  1. 通过查询条件过滤导出
scan 'LOG20180108',{COLUMNS => 'INFO',LIMIT=>1,FILTER=>"(PrefixFilter('T')) AND (SingleColumnValueFilter('INFO','AreaCode',=,'binary:610103'))"}
导出到文件:
echo "scan 'LOG20180108',{COLUMNS => ['INFO'],LIMIT=>1,FILTER=>\"(PrefixFilter('T')) AND (SingleColumnValueFilter('INFO','AreaCode',=,'binary:610103'))\"}" | hbase shell > myText.csv

  1. 通过hive导出
  1. 将hbase表与hive临时表同步。
  2. 将hive临时表数据导入到真实表
  3. 将真实表数据导入数据库
脚本如下:
#!/bin/bash

#获取输入时间,默认为当前时间
get_time(){
if [ 3 -eq $# ]
then
date=`date -d "+0 day $1" +%Y%m%d`
enddate=`date -d "+1 day $2" +%Y%m%d`
elif [ 0 -eq $# ]
then
echo "无输入参数,则默认构建昨天."
echo "若是批量构建,请输入时间段,格式为【$0 yyyy-mm-dd yyyy-mm-dd】."
#read -p "若不输入参数则默认构建昨天数据,输入【y】继续构建昨日数据,输入【n】退出:" isBuild
#case $isBuild in
#y | Y)
date=`date -d "+0 day yesterday" +%Y%m%d`
enddate=`date +%Y%m%d`
#; ;
#n | N)
#exit
#; ;
#*)
#echo "输入错误,退出"
#exit
#; ;
#esac
else
【hbase|Hbase数据导出实例】echo "输入有误."
echo "若是批量构建,请输入时间段,格式为【$0 yyyy-mm-dd yyyy-mm-dd】."
echo "若默认构建昨天数据,则不需要输入参数,直接执行【$0】."
fi
}



#创建存储数据表结构
hive_table(){
echo "create hive table start......."
hive -S -e "DROP TABLE IF EXISTS LogTerminal;
CREATE TABLE LogTerminal(rowkey string,TerminalID string,TerminalMac string,DeviceType string, Power string,Channel string,MaxPower string,TimeNear string,LonNear string,LatNear string,RouteMac string,SSID string, SSIDs string,SecurityType string,RealType string, RealCode string,TerminalType int,PcBrand string, Phone string,IMSI string,IMEI string,OS string,CustomerStartTime string,OffLineTime string, Model string,CoordinateX string,CoordinateY string, OffLineLon string,OffLineLat string,PcIP string, RouteType string,SessionID string,ProtoType string,CyberCode string,IsMove string,IsElectric string, SafeState string,GPIOState string,Serial string, GuildID string,Time string,ManufacturerCode string,AreaCode string,UnitCode string,MachineCode string, SystemType string,DATASOURCEID string,Lon string, Lat string,LatLon string,RESOURCETYPE string, AccessSystemID string,InterfaceID string,InterfaceGroupID string,WriterTime string )COMMENT 'LogTerminal Table' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS PARQUET; "
hive -S -e "DROP TABLE IF EXISTS LogTerminal_min;
CREATE TABLE LogTerminal_min(rowkey string,TerminalID string,TerminalMac string,DeviceType string, Power string,Channel string,MaxPower string,TimeNear string,LonNear string,LatNear string,RouteMac string,SSID string, SSIDs string,SecurityType string,RealType string, RealCode string,TerminalType int,PcBrand string, Phone string,IMSI string,IMEI string,OS string,CustomerStartTime string,OffLineTime string, Model string,CoordinateX string,CoordinateY string, OffLineLon string,OffLineLat string,PcIP string, RouteType string,SessionID string,ProtoType string,CyberCode string,IsMove string,IsElectric string, SafeState string,GPIOState string,Serial string, GuildID string,Time string,ManufacturerCode string,AreaCode string,UnitCode string,MachineCode string, SystemType string,DATASOURCEID string,Lon string, Lat string,LatLon string,RESOURCETYPE string, AccessSystemID string,InterfaceID string,InterfaceGroupID string,WriterTime string )COMMENT 'LogTerminal Table' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS PARQUET; "
hive -S -e "DROP TABLE IF EXISTS LogTerminal_max;
CREATE TABLE LogTerminal_max(rowkey string,TerminalID string,TerminalMac string,DeviceType string, Power string,Channel string,MaxPower string,TimeNear string,LonNear string,LatNear string,RouteMac string,SSID string, SSIDs string,SecurityType string,RealType string, RealCode string,TerminalType int,PcBrand string, Phone string,IMSI string,IMEI string,OS string,CustomerStartTime string,OffLineTime string, Model string,CoordinateX string,CoordinateY string, OffLineLon string,OffLineLat string,PcIP string, RouteType string,SessionID string,ProtoType string,CyberCode string,IsMove string,IsElectric string, SafeState string,GPIOState string,Serial string, GuildID string,Time string,ManufacturerCode string,AreaCode string,UnitCode string,MachineCode string, SystemType string,DATASOURCEID string,Lon string, Lat string,LatLon string,RESOURCETYPE string, AccessSystemID string,InterfaceID string,InterfaceGroupID string,WriterTime string )COMMENT 'LogTerminal Table' ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS PARQUET; "
echo "create hive table end......."
}

#创建临时终端日志表结构,并将hbase表和hive临时表关联,然后根据条件查询临时表数据插入到真实表中
hive_task(){
echo "hive task $1 $2 $3..."
DATA_FORMAT=`date -d "$1" +%Y-%m-%d`
TABLE_NAME=LOG$1
AREA_CODE=$3

echo $DATA_FORMAT
echo $TABLE_NAME
echo $AREA_CODE

hive -S -e "DROP TABLEIF EXISTS TempLogTerminal;
CREATE EXTERNAL TABLE TempLogTerminal(key string,TerminalID string,TerminalMac string,DeviceType string, Power string,Channel string,MaxPower string,TimeNear string,LonNear string,LatNear string,RouteMac string,SSID string, SSIDs string,SecurityType string,RealType string, RealCode string,TerminalType int,PcBrand string, Phone string,IMSI string,IMEI string,OS string,CustomerStartTime string,OffLineTime string, Model string,CoordinateX string,CoordinateY string, OffLineLon string,OffLineLat string,PcIP string, RouteType string,SessionID string,ProtoType string,CyberCode string,IsMove string,IsElectric string, SafeState string,GPIOState string,Serial string, GuildID string,Time string,ManufacturerCode string,AreaCode string,UnitCode string,MachineCode string,SystemType string,DATASOURCEID string,Lon string, Lat string,LatLon string,RESOURCETYPE string, AccessSystemID string,InterfaceID string,InterfaceGroupID string,WriterTime string)ROW FORMAT SERDE 'org.apache.hadoop.hive.hbase.HBaseSerDe' STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES('hbase.columns.mapping'=':key,INFO:TerminalID,INFO:TerminalMac,INFO:DeviceType,INFO:Power,INFO:Channel,INFO:MaxPower,INFO:TimeNear,INFO:LonNear,INFO:LatNear,INFO:RouteMac,INFO:SSID,INFO:SSIDs,INFO:SecurityType,INFO:RealType,INFO:RealCode,INFO:TerminalType,INFO:PcBrand,INFO:Phone,INFO:IMSI,INFO:IMEI,INFO:OS,INFO:CustomerStartTime,INFO:OffLineTime,INFO:Model,INFO:CoordinateX,INFO:CoordinateY,INFO:OffLineLon,INFO:OffLineLat,INFO:PcIP,INFO:RouteType,INFO:SessionID,INFO:ProtoType,INFO:CyberCode,INFO:IsMove,INFO:IsElectric,INFO:SafeState,INFO:GPIOState,INFO:Serial,INFO:GuildID,INFO:Time,INFO:ManufacturerCode,INFO:AreaCode,INFO:UnitCode,INFO:MachineCode,INFO:SystemType,INFO:DATASOURCEID,INFO:Lon,INFO:Lat,INFO:LatLon,INFO:RESOURCETYPE,INFO:AccessSystemID,INFO:InterfaceID,INFO:InterfaceGroupID,INFO:WriterTime') TBLPROPERTIES('hbase.table.name'='$TABLE_NAME') ;
INSERT $2 TABLE logterminal SELECT key as rowkey,TerminalID,TerminalMac,DeviceType,Power,Channel,MaxPower,TimeNear,LonNear,LatNear,RouteMac,SSID,SSIDs,SecurityType,RealType,RealCode,TerminalType,PcBrand, Phone,IMSI,IMEI,OS,CustomerStartTime,OffLineTime,Model,CoordinateX,CoordinateY,OffLineLon,OffLineLat,PcIP,RouteType,SessionID,ProtoType,CyberCode,IsMove,IsElectric, SafeState,GPIOState,Serial,GuildID,Time,ManufacturerCode,AreaCode,UnitCode,MachineCode,SystemType,DATASOURCEID,Lon,Lat,LatLon,RESOURCETYPE,AccessSystemID,InterfaceID,InterfaceGroupID,WriterTime
FROM TempLogTerminal WHERE RESOURCETYPE=32 ANDAreaCode='$AREA_CODE'; "

echo "hive task end......."
}

#创建sql表
sqlserver_table(){
"
if exists (select * from sysobjects where name='LogTerminal_min')
drop table LogTerminal_min

CREATETABLE LogTerminal_min(
rowkey nvarchar(200),TerminalID bigint,TerminalMac nvarchar(200),DeviceType bigint,
Power bigint,Channel bigint,MaxPower bigint,
TimeNear nvarchar(200),LonNear float,LatNear float,
RouteMac nvarchar(200),SSID nvarchar(200),
SSIDs nvarchar(200),SecurityType nvarchar(200),RealType bigint,
RealCode nvarchar(200),TerminalType bigint,PcBrand nvarchar(200),
Phone nvarchar(200),IMSI nvarchar(200),IMEI nvarchar(200),
OS nvarchar(200),CustomerStartTime nvarchar(200),OffLineTime nvarchar(200),
Model nvarchar(200),CoordinateX nvarchar(200),CoordinateY nvarchar(200),
OffLineLon float,OffLineLat float,PcIP nvarchar(200),
RouteType bigint,SessionID nvarchar(200),ProtoType bigint,
CyberCode nvarchar(200),IsMove bigint,IsElectric bigint,
SafeState bigint,GPIOState bigint,Serial nvarchar(200),
GuildID nvarchar(200),Time nvarchar(200),ManufacturerCode nvarchar(200),
AreaCode nvarchar(200),UnitCode nvarchar(200),MachineCode nvarchar(200),
SystemType nvarchar(200),DATASOURCEID bigint,Lon float,
Lat float,LatLon nvarchar(200),RESOURCETYPE bigint,
AccessSystemID bigint,InterfaceID bigint,InterfaceGroupID bigint,
WriterTime nvarchar(200)
);

if exists (select * from sysobjects where name='LogTerminal_max')
drop table LogTerminal_max

CREATETABLE LogTerminal_max(
rowkey nvarchar(200),TerminalID bigint,TerminalMac nvarchar(200),DeviceType bigint,
Power bigint,Channel bigint,MaxPower bigint,
TimeNear nvarchar(200),LonNear float,LatNear float,
RouteMac nvarchar(200),SSID nvarchar(200),
SSIDs nvarchar(200),SecurityType nvarchar(200),RealType bigint,
RealCode nvarchar(200),TerminalType bigint,PcBrand nvarchar(200),
Phone nvarchar(200),IMSI nvarchar(200),IMEI nvarchar(200),
OS nvarchar(200),CustomerStartTime nvarchar(200),OffLineTime nvarchar(200),
Model nvarchar(200),CoordinateX nvarchar(200),CoordinateY nvarchar(200),
OffLineLon float,OffLineLat float,PcIP nvarchar(200),
RouteType bigint,SessionID nvarchar(200),ProtoType bigint,
CyberCode nvarchar(200),IsMove bigint,IsElectric bigint,
SafeState bigint,GPIOState bigint,Serial nvarchar(200),
GuildID nvarchar(200),Time nvarchar(200),ManufacturerCode nvarchar(200),
AreaCode nvarchar(200),UnitCode nvarchar(200),MachineCode nvarchar(200),
SystemType nvarchar(200),DATASOURCEID bigint,Lon float,
Lat float,LatLon nvarchar(200),RESOURCETYPE bigint,
AccessSystemID bigint,InterfaceID bigint,InterfaceGroupID bigint,
WriterTime nvarchar(200)
);
"
}

#将数据导入到sqlserver中
import_sqlserver(){
echo " import sqlserver start......."
sqoop export -connect 'jdbc:sqlserver://192.168.2.219; username=nsmc53; password=123456; database=WFBDCMain'-table LogTerminal_min --hcatalog-database default --hcatalog-table LogTerminal_min --num-mappers 5;
sqoop export -connect 'jdbc:sqlserver://192.168.2.219; username=nsmc53; password=123456; database=WFBDCMain'-table LogTerminal_max --hcatalog-database default --hcatalog-table LogTerminal_max --num-mappers 5;

echo " import sqlserver end......."
}

#将hive数据导出到本地目录
export_hive_local(){
echo " export hive to local start......."
mkdir -p /home/hive/min
mkdir -p /home/hive/max
hive -S -e "\
insert overwrite local directory '/home/hive/min'ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select a.* from LogTerminal a inner join(select TerminalMac,min(time) time from LogTerminal group by TerminalMac) b on a.TerminalMac = b.TerminalMac and a.time = b.time order by a.TerminalMac ; \
insert overwrite local directory '/home/hive/max'ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' select a.* from LogTerminal a inner join(select TerminalMac,max(time) time from LogTerminal group by TerminalMac) b on a.TerminalMac = b.TerminalMac and a.time = b.time order by a.TerminalMac ; "

#将hive数据导出到csv文件
hive -e " set hive.cli.print.header=true; select a.* from LogTerminal a inner join(select TerminalMac,min(time) time from LogTerminal group by TerminalMac) b on a.TerminalMac = b.TerminalMac and a.time = b.time order by a.TerminalMac ; " >> /home/hive/logterminal-min.csv

echo "export hive to local end......."
}

#将hive数据根据查询条件过滤导入到其他表
export_hive(){
echo " export hive to sqlserver start......."
#查询时间最小mac
hive -S -e "INSERT OVERWRITE TABLE LogTerminal_min select a.* from LogTerminal a inner join(select TerminalMac,min(time) time from LogTerminal group by TerminalMac) b on a.TerminalMac = b.TerminalMac and a.time = b.time order by a.TerminalMac ; ";
#查询时间最大mac
hive -S -e "INSERT OVERWRITE TABLE LogTerminal_max select a.* from LogTerminal a inner join(select TerminalMac,max(time) time from LogTerminal group by TerminalMac) b on a.TerminalMac = b.TerminalMac and a.time = b.time order by a.TerminalMac ; ";

echo "export hive to sqlserver end......."
}

main(){
#创建hive表LogTerminal
echo " create hive table LogTerminal......."
hive_table

#生成表结构时间范围
get_time $*
Style="OVERWRITE"
date1=$date
while [[ $date1 < $enddate ]]
do
echo "$date"
##创建临时终端日志表TempLogTerminal关联hbase表,重新运行时删除以前创建表结构,并将查询数据导入LogTerminal
echo " exechive_task....... $date1 $Style $3 "
hive_task $date1 $Style $3
date1=`date -d "+1 day $date1" +%Y%m%d`
Style="INTO"
done
#从hive表LogTerminal日志表导出到本地目录/home/hive
echo " export hive to sqlserver ......."
export_hive
echo " import sqlserver ......."
import_sqlserver
echo " query logterminal end......."
}

main $*


    推荐阅读