[TOC]
All installation packages are located under /opt/modules/, as shown below:
mysql -u <username> -e "create database quartz" -p
e.g. mysql -u root -e "create database quartz" -p
mysql -u <username> -p quartz < Init_quartz_mysql_innodb.sql
e.g. mysql -u root -p quartz < Init_quartz_mysql_innodb.sql
Note: remember to verify that the database and its tables were created successfully, and make sure every step completed without errors.
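A quick way to verify (a minimal check, assuming the root account and the quartz database created above):
# list the Quartz tables; you should see the QRTZ_* tables after the init script has run
mysql -u root -p -e "use quartz; show tables;"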
Hadoop is assumed to be installed already, so at this point we only need to review and update the configuration files below.
<configuration>
<property>
<name>fs.defaultFS</name>
<!-- ip: replace with the actual address -->
<value>hdfs://ip:9000</value>
</property>
</configuration>
hadoop-3.1.3/data/hadoop-data
Create the nn, dn, and snn directories, then distribute them to the other nodes:
[root@master hadoop-data]# mkdir nn
[root@master hadoop-data]# mkdir dn
[root@master hadoop-data]# mkdir snn
[root@master hadoop-data]# cd ..
// distribute: change the IPs and paths below to match your environment
// distribute to node 1
[root@master data]# scp -r hadoop-data/ root@10.168.57.11:/opt/modules/hadoop-3.1.3/data/
// distribute to node 2
[root@master data]# scp -r hadoop-data/ root@10.168.57.12:/opt/modules/hadoop-3.1.3/data/
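As an optional sanity check (assuming the same SSH access used for the scp commands above), confirm the directories arrived on each node:
ssh root@10.168.57.11 "ls /opt/modules/hadoop-3.1.3/data/hadoop-data"
ssh root@10.168.57.12 "ls /opt/modules/hadoop-3.1.3/data/hadoop-data"
# both should list: dn  nn  snn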
Check whether ./etc/hadoop/hdfs-site.xml already contains the following settings, and add any that are missing. Note: do not forget to sync the configuration to every node.
<configuration>
<!-- Secondary NameNode host; slave1 here is the hostname mapped to node 2's IP -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>slave1:50090</value>
</property>
<property>
<name>dfs.namenode.logging.level</name>
<value>warn</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<!-- nn, dn and snn must be created manually; change these directories to your local paths -->
<property>
<name>dfs.namenode.name.dir</name>
<value>/opt/modules/hadoop-3.1.3/data/hadoop-data/nn</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/opt/modules/hadoop-3.1.3/data/hadoop-data/dn</value>
</property>
<property>
<name>dfs.namenode.checkpoint.dir</name>
<value>/opt/modules/hadoop-3.1.3/data/hadoop-data/snn</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>false</value>
</property>
<property>
<name>dfs.namenode.datanode.registration.ip-hostname-check</name>
<value>false</value>
</property>
</configuration>
// change the IPs and paths here to match your environment
[root@master hadoop]# scp -r hdfs-site.xml root@10.168.57.12:/opt/modules/hadoop-3.1.3/etc/hadoop/
[root@master hadoop]# scp -r hdfs-site.xml root@10.168.57.11:/opt/modules/hadoop-3.1.3/etc/hadoop/
<!-- Whether to run a thread that checks the physical memory used by each task and kills the task if it exceeds its allocation; default is true -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<!-- Whether to run a thread that checks the virtual memory used by each task and kills the task if it exceeds its allocation; default is true -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
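These two properties belong inside the &lt;configuration&gt; element of ./etc/hadoop/yarn-site.xml. Remember to sync that file to every node as well, for example (same IPs and paths as used above):
scp yarn-site.xml root@10.168.57.11:/opt/modules/hadoop-3.1.3/etc/hadoop/
scp yarn-site.xml root@10.168.57.12:/opt/modules/hadoop-3.1.3/etc/hadoop/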
Start HDFS
# format the NameNode (run this only once, before the first start)
hdfs namenode -format
# enter the hadoop installation directory
cd xxxx
# start
sbin/start-dfs.sh
# stop
sbin/stop-dfs.sh
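Optionally verify the daemons with jps (from the JDK). The exact set per node depends on your workers file, but with the configuration above the NameNode runs on master and the SecondaryNameNode on slave1:
jps
# master -> NameNode (plus DataNode if it is also listed as a worker)
# slave1 -> SecondaryNameNode, DataNode
# node 1 -> DataNode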
After startup, visit http://ip:50070/ (for Hadoop 3.0 and above, visit http://ip:9870/ instead).
# Note: run these commands on the node where YARN is deployed
# enter the hadoop installation directory
cd xxxx
sbin/yarn-daemon.sh start resourcemanager
# stop
sbin/yarn-daemon.sh stop resourcemanager
After startup, visit http://ip:8088/
# enter the hadoop installation directory
cd xxxx
sbin/mr-jobhistory-daemon.sh start historyserver
# stop
sbin/mr-jobhistory-daemon.sh stop historyserver
After startup, visit http://ip:19888/jobhistory
# extract
tar -zxvf /opt/software/apache-hive-3.1.2-bin.tar.gz -C /opt/modules/
# rename the directory
mv /opt/modules/apache-hive-3.1.2-bin/ /opt/modules/hive-3.1.2
# log in to MySQL
mysql -uroot -proot
# create the metastore database
create database metastore;
# exit MySQL
quit;
# initialize the Hive metastore schema (run from the shell, after hive-site.xml below has been configured)
schematool -initSchema -dbType mysql -verbose
Configure the Hive files
a. Upload the MySQL JDBC driver and copy it into Hive's lib directory
cp /opt/software/mysql-connector-java-5.1.37.jar $HIVE_HOME/lib
b. Create hive-site.xml
vim $HIVE_HOME/conf/hive-site.xml
c. Add the following to hive-site.xml (watch the IPs and paths; a copy of this file is included in the package folder)
<?xml version="1.0" encoding="UTF-8" standalone="no"?><?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://10.168.57.10:3306/metastore?useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF-8</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<!-- HiveServer2 settings -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.server2.thrift.bind.host</name>
<value>10.168.57.10</value>
</property>
<property>
<name>hive.exec.post.hooks</name>
<value>org.apache.atlas.hive.hook.HiveHook</value>
</property>
<property>
<name>metastore.storage.schema.reader.impl</name>
<value>org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>NONE</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>hive.server2.enable.doAs</name>
<value>FALSE</value>
</property>
<!-- HiveServer2 HA parameter; enabling it can speed up HiveServer2 startup -->
<property>
<name>hive.server2.active.passive.ha.enable</name>
<value>true</value>
</property>
<property>
<name>hive.fetch.task.conversion</name>
<value>more</value>
<description>
Expects one of [none, minimal, more].
Some select queries can be converted to single FETCH task minimizing latency.
Currently the query should be single sourced not having any subquery and should not have
any aggregations or distincts (which incurs RS), lateral views and joins.
0. none : disable hive.fetch.task.conversion
1. minimal : SELECT STAR, FILTER on partition columns, LIMIT only
2. more : SELECT, FILTER, LIMIT only (support TABLESAMPLE and virtual columns); simple queries skip MapReduce, which speeds them up
</description>
</property>
<!-- The following properties were added for Griffin -->
<property>
<name>hive.exec.local.scratchdir</name>
<value>/opt/modules/hive-3.1.2/temp/hive</value>
<description>Local scratch space for Hive jobs</description>
</property>
<property>
<name>hive.downloaded.resources.dir</name>
<value>/opt/modules/hive-3.1.2/temp/hive/${hive.session.id}_resources</value>
<description>Temporary local directory for added resources in the remote file system.</description>
</property>
<property>
<name>hive.querylog.location</name>
<value>/opt/modules/hive-3.1.2/temp/hive</value>
<description>Location of Hive run time structured log file</description>
</property>
<property>
<name>hive.server2.logging.operation.log.location</name>
<value>/opt/modules/hive-3.1.2/temp/hive/operation_logs</value>
</property>
<!-- Address of the metastore service to connect to -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://127.0.0.1:9083</value>
</property>
<!-- metastore event notification API authorization -->
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
<!-- Hive's default working directory on HDFS; not configured in this deployment (Huawei Cloud) -->
<!-- <property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
-->
</configuration>
d. Start the Hive metastore
# start the hive metastore. Note: it runs in the foreground and blocks this window, so open a new shell for other work (it can also be started in the background; see the sketch after this block)
cd $HIVE_HOME
bin/hive --service metastore
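A minimal sketch for running the metastore in the background instead (the log path is just an example):
# run the metastore detached; stdout/stderr go to the example log file below
nohup bin/hive --service metastore > /tmp/hive-metastore.log 2>&1 &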
e. Test whether Hive is configured correctly
# connect to hive with the local CLI
bin/hive
show databases;
show tables;
Note: Spark, Livy, and Griffin all have Scala version requirements, so Scala 2.11 is required; the archive is included in the package folder.
[root@master tmp]# tar -zxvf scala-2.11.6.tgz -C /opt/modules/
# scala (append the following to /etc/profile)
export SCALA_HOME=/opt/modules/scala-2.11.6
export PATH=$PATH:$SCALA_HOME/bin
source /etc/profile
[root@master tmp]# cd $SCALA_HOME
# if this puts you in the scala-2.11.6 directory, the profile change has taken effect
[root@master scala-2.11.6]#
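As an extra check (assuming the PATH change above), confirm the Scala version:
scala -version
# expected output: Scala code runner version 2.11.6 ...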
Note: the files to modify and the installation package are under the spark folder.
# extract the spark archive
tar -zxvf spark-2.2.1-bin-hadoop2.7.tgz -C /opt/modules
# enter the modules directory
cd /opt/modules
# rename the directory
mv spark-2.2.1-bin-hadoop2.7 spark-2.2.1
export JAVA_HOME=/usr/java/jdk1.8.0_301
export SCALA_HOME=/opt/modules/scala-2.11.6
YARN_CONF_DIR=/opt/modules/hadoop-3.1.3/etc/hadoop
HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
SPARK_MASTER_HOST=master
SPARK_MASTER_PORT=7077
SPARK_MASTER_WEBUI_PORT=8082
SPARK_LOCAL_IP=10.168.57.10
SPARK_PID_DIR=/opt/modules/spark-2.2.1/pids
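The settings above are typically placed in conf/spark-env.sh under the Spark installation; a minimal sketch, assuming only the shipped template exists yet:
cd /opt/modules/spark-2.2.1
cp conf/spark-env.sh.template conf/spark-env.sh
vim conf/spark-env.sh   # paste in the settings listed above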
Start Hadoop HDFS (skip this step if it is already running)
Start Spark
# enter the spark-yarn installation directory
cd xxx
sbin/start-all.sh
After startup, visit http://ip:8082/
bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn \
--deploy-mode client \
./examples/jars/spark-examples_2.11-2.2.1.jar \
10
Check YARN: http://ip:8088/
a. Edit conf/spark-defaults.conf
spark.master yarn-cluster
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.yarn.jars hdfs:///home/spark_lib/*
spark.yarn.dist.files hdfs:///home/spark_conf/hive-site.xml
spark.sql.broadcastTimeout 500
b. Edit conf/spark-env.sh
HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
SPARK_MASTER_HOST=master
SPARK_MASTER_PORT=7077
SPARK_MASTER_WEBUI_PORT=8082
SPARK_LOCAL_IP=localhost
SPARK_PID_DIR=/opt/modules/spark-yarn/pids
c. Create the directories in HDFS and upload the files
# create the directories
hdfs dfs -mkdir -p /home/spark_lib
hdfs dfs -mkdir -p /home/spark_conf
# upload the files
hdfs dfs -put /opt/modules/spark-yarn/jars/* hdfs:///home/spark_lib/
hdfs dfs -put /opt/modules/hive-3.1.2/conf/hive-site.xml hdfs:///home/spark_conf/
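Optionally verify the uploads before moving on:
hdfs dfs -ls /home/spark_lib | head
hdfs dfs -ls /home/spark_conf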
Upload and extract the installation package
After extracting, enter the conf directory, copy livy.conf.template to livy.conf, and add the following:
# Livy server host and the Spark cluster master used by Livy sessions
livy.server.host = 43.143.224.212
livy.spark.master = yarn
# Spark deploy mode used by Livy sessions
livy.spark.deploy-mode = cluster
#livy.spark.deployMode = cluster
# use HiveContext by default
livy.repl.enableHiveContext = true
# enable user impersonation
livy.impersonation.enabled = true
# session idle timeout
livy.server.session.timeout = 1h
# Livy server port
livy.server.port = 8998
# change these paths to your own
export SPARK_HOME=/opt/modules/spark-yarn
export HADOOP_CONF_DIR=/opt/modules/hadoop-3.1.3/etc/hadoop
mkdir /xxxxx/livy/logs
# enter the livy installation directory
cd xxx
bin/livy-server start
# stop
bin/livy-server stop
After startup, visit http://ip:8998/ui
a. Create a session
curl -XPOST 'http://10.168.57.10:8998/sessions' -H "Content-Type:application/json" --data '{"kind":"spark"}'
Note: requests sent to the Livy session are executed only once its state becomes idle; while a request runs the state changes to busy, and it returns to idle when finished. The session state can be polled as sketched below.
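A minimal way to poll the session state (session id 0 from the creation call above is assumed):
curl http://10.168.57.10:8998/sessions/0
# the JSON response contains a "state" field: starting -> idle -> busy -> idle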
b. Create hello.txt in the current directory and upload it to HDFS
vim hello.txt
nihao spark
nihao scala
hello livy
hadoop fs -mkdir /livydemo
hadoop fs -put ./hello.txt /livydemo
c. Submit a job (the input path matches the file uploaded above; adjust the output path as needed)
curl -XPOST 'http://10.168.57.10:8998/sessions/0/statements' -H 'Content-Type:application/json' -d '{"code":"sc.textFile(\"hdfs:///livydemo/hello.txt\").flatMap(_.split(\" \")).map((_,1)).reduceByKey(_+_).saveAsTextFile(\"hdfs:///livydemo/result/1\")"}'
Check the run in YARN: http://ip:8088/
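Alternatively, poll the statement itself through the Livy REST API (statement id 0 is assumed for the first statement of the session):
curl http://10.168.57.10:8998/sessions/0/statements/0
# "state" moves from running to available; "output" holds the result or the error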
# extract
tar -zxvf elasticsearch-5.0.0.tar.gz -C /opt/modules
# rename
mv elasticsearch-5.0.0 es-cluster
cluster.name: es-cluster #cluster name; different names mean different clusters
node.name: master #node name, customizable
path.data: /opt/modules/es-cluster/es/data #data path
path.logs: /opt/modules/es-cluster/es/logs #log path
bootstrap.memory_lock: false #disable memory locking
network.host: 10.168.57.10 #bind IP address; if the IP has a hostname alias, try the alias, since a plain numeric IP may cause errors
http.port: 9200 #bind port
discovery.zen.ping.unicast.hosts: ["10.168.57.10", "10.168.57.11", "10.168.57.12"] #cluster host list (array); entries may be IPs or hostnames
discovery.zen.minimum_master_nodes: 2 #must not exceed the total node count; follow the quorum rule (N/2)+1 to prevent split-brain, so 2 for this three-node cluster
http.cors.enabled: true #enable HTTP CORS
http.cors.allow-origin: "*" #allow cross-origin requests from any origin
#create an es user to run Elasticsearch (it must not run as root)
[root@master modules]# cd es-cluster/
[root@master es-cluster]# groupadd es
[root@master es-cluster]# useradd es -g es -p codingwhy
[root@master es-cluster]# chown -R es:es es-cluster
chown: cannot access ‘es-cluster’: No such file or directory
[root@master es-cluster]# cd ..
[root@master modules]# chown -R es:es es-cluster
Adjust two system settings
First, the Linux soft/hard open-file limits are too low and must be at least 65536; second, vm.max_map_count is too low and must be at least 262144.
vim /etc/security/limits.conf
* soft nofile 65536
* hard nofile 65536
vim /etc/sysctl.conf
# add
vm.max_map_count=262144
# run sysctl -p to apply the change
[root@localhost es]# sysctl -p
su es
# run in the foreground; the terminal is blocked while it runs
/opt/modules/es-cluster/bin/elasticsearch
# run in the background (daemonized)
/opt/modules/es-cluster/bin/elasticsearch -d
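To confirm the cluster came up (using the bind address and port configured above):
curl http://10.168.57.10:9200/
# and the node list
curl http://10.168.57.10:9200/_cat/nodes?v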
curl -k -H "Content-Type: application/json" -X PUT http://10.168.57.10:9200/griffin \
-d '{
"aliases": {},
"mappings": {
"accuracy": {
"properties": {
"name": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"tmst": {
"type": "date"
}
}
}
},
"settings": {
"index": {
"number_of_replicas": "2",
"number_of_shards": "5"
}
}
}'
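A quick check that the griffin index was created with the expected mapping:
curl http://10.168.57.10:9200/_cat/indices?v
curl http://10.168.57.10:9200/griffin/_mapping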
Note: the package folder contains two files, measure-0.6.0.jar and service-0.6.0.tar.gz. Normally only the configuration inside service-0.6.0.tar.gz needs to be changed, and most of the changes are path names and IPs. The files that need modification are listed below for future deployments; they are all under config, and the headings also show where each file lives in the source tree. If editing the config files alone does not solve the problem, the project has to be rebuilt from source.
The changes are mainly the relevant IPs, paths, and account credentials.
# Apache Griffin server port (default 8080)
server.port = 8091
spring.application.name=griffin_service
spring.datasource.url=jdbc:mysql://10.168.57.10:3306/quartz?useSSL=false
spring.datasource.username=root
spring.datasource.password=root
spring.jpa.generate-ddl=true
spring.datasource.driver-class-name=com.mysql.jdbc.Driver
spring.jpa.show-sql=true
# Hive metastore
hive.metastore.uris=thrift://10.168.57.10:9083
hive.metastore.dbname=default
hive.hmshandler.retry.attempts=15
hive.hmshandler.retry.interval=2000ms
#Hive jdbc
hive.jdbc.className=org.apache.hive.jdbc.HiveDriver
hive.jdbc.url=jdbc:hive2://10.168.57.10:10000/
hive.need.kerberos=false
hive.keytab.user=xxx@xx.com
hive.keytab.path=/path/to/keytab/file
# Hive cache time
cache.evict.hive.fixedRate.in.milliseconds=900000
# Kafka schema registry
kafka.schema.registry.url=http://localhost:8081
# Update job instance state at regular intervals
jobInstance.fixedDelay.in.milliseconds=60000
# Expired time of a job instance: 7 days, i.e. 604800000 milliseconds. The time unit only supports milliseconds
jobInstance.expired.milliseconds=604800000
# schedule predicate job every 5 minutes and repeat 12 times at most
# interval time unit s:second m:minute h:hour d:day,only support these four units
predicate.job.interval=5m
predicate.job.repeat.count=12
# external properties directory location
external.config.location=
# external BATCH or STREAMING env
external.env.location=
# login strategy ("default" or "ldap")
login.strategy=default
# ldap
ldap.url=ldap://hostname:port
ldap.email=@example.com
ldap.searchBase=DC=org,DC=example
ldap.searchPattern=(sAMAccountName={0})
# hdfs default name
fs.defaultFS=
# elasticsearch
elasticsearch.host=10.168.57.10
elasticsearch.port=9200
elasticsearch.scheme=http
# elasticsearch.user = user
# elasticsearch.password = password
# livy
livy.uri=http://10.168.57.10:8998/batches
livy.need.queue=false
livy.task.max.concurrent.count=20
livy.task.submit.interval.second=3
livy.task.appId.retry.count=3
livy.need.kerberos=false
livy.server.auth.kerberos.principal=livy/kerberos.principal
livy.server.auth.kerberos.keytab=/path/to/livy/keytab/file
# yarn url
yarn.uri=http://10.168.57.11:8088
# griffin event listener
internal.event.listeners=GriffinJobEventHook
logging.file=logs/griffin-service.log
org.quartz.scheduler.instanceName=spring-boot-quartz
org.quartz.scheduler.instanceId=AUTO
org.quartz.threadPool.threadCount=5
org.quartz.jobStore.class=org.quartz.impl.jdbcjobstore.JobStoreTX
# If you use postgresql as your database,set this property value to org.quartz.impl.jdbcjobstore.PostgreSQLDelegate
# If you use mysql as your database,set this property value to org.quartz.impl.jdbcjobstore.StdJDBCDelegate
# If you use h2 as your database, it's ok to set this property value to StdJDBCDelegate, PostgreSQLDelegate or others
# This is the main line to change; if you use MySQL, use the value below
org.quartz.jobStore.driverDelegateClass=org.quartz.impl.jdbcjobstore.StdJDBCDelegate
org.quartz.jobStore.useProperties=true
org.quartz.jobStore.misfireThreshold=60000
org.quartz.jobStore.tablePrefix=QRTZ_
org.quartz.jobStore.isClustered=true
org.quartz.jobStore.clusterCheckinInterval=20000
This file mainly contains Hadoop-related paths; in most cases the paths below can be left unchanged.
{
"file": "hdfs:///griffin/griffin-measure.jar",
"className": "org.apache.griffin.measure.Application",
"queue": "default",
"numExecutors": 2,
"executorCores": 1,
"driverMemory": "1g",
"executorMemory": "1g",
"conf": {
"spark.yarn.dist.files": "hdfs:///home/spark_conf/hive-site.xml"
},
"files": [
]
}
{
"spark": {
"log.level": "WARN"
},
"sinks": [
{
"name": "console",
"type": "CONSOLE",
"config": {
"max.log.lines": 10
}
},
{
"name": "hdfs",
"type": "HDFS",
"config": {
"path": "hdfs:///griffin/persist",
"max.persist.lines": 10000,
"max.lines.per.file": 10000
}
},
{
"name": "elasticsearch",
"type": "ELASTICSEARCH",
"config": {
"method": "post",
"api": "http://10.168.57.10:9200/griffin/accuracy",
"connection.timeout": "1m",
"retry": 10
}
}
],
"griffin.checkpoint": []
}
{
"spark": {
"log.level": "WARN",
"checkpoint.dir": "hdfs:///griffin/checkpoint/${JOB_NAME}",
"init.clear": true,
"batch.interval": "1m",
"process.interval": "5m",
"config": {
"spark.default.parallelism": 4,
"spark.task.maxFailures": 5,
"spark.streaming.kafkaMaxRatePerPartition": 1000,
"spark.streaming.concurrentJobs": 4,
"spark.yarn.maxAppAttempts": 5,
"spark.yarn.am.attemptFailuresValidityInterval": "1h",
"spark.yarn.max.executor.failures": 120,
"spark.yarn.executor.failuresValidityInterval": "1h",
"spark.hadoop.fs.hdfs.impl.disable.cache": true
}
},
"sinks": [
{
"type": "CONSOLE",
"config": {
"max.log.lines": 100
}
},
{
"type": "HDFS",
"config": {
"path": "hdfs:///griffin/persist",
"max.persist.lines": 10000,
"max.lines.per.file": 10000
}
},
{
"type": "ELASTICSEARCH",
"config": {
"method": "post",
"api": "http://10.168.57.10:9200/griffin/accuracy"
}
}
],
"griffin.checkpoint": [
{
"type": "zk",
"config": {
"hosts": "zk:2181",
"namespace": "griffin/infocache",
"lock.path": "lock",
"mode": "persist",
"init.clear": true,
"close.clear": false
}
}
]
}
# rename the jar
mv measure-0.6.0.jar griffin-measure.jar
# upload the jar to the griffin directory in HDFS; create the directory first if it does not exist
hdfs dfs -put griffin-measure.jar /griffin/
# command to create the griffin directory, if needed
#hdfs dfs -mkdir /griffin
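Confirm the jar is in place before continuing:
hdfs dfs -ls /griffin/
# should list griffin-measure.jar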
# extract the service package from the service module directory (the one containing target/)
tar -zxvf target/service-0.6.0.tar.gz -C /opt/modules
cd /opt/modules/service-0.6.0
# start the service
./bin/griffin.sh start
# or use ./bin/start.sh
# stop the service
./bin/griffin.sh stop
# or use ./bin/stop.sh
Access the UI
http://<your IP>:<your port> (with the application.properties above, the port is 8091)