# Data Profiling Service Installation (Huawei Cloud)

[TOC]

## 1. Overview

### 1.1 Software and Versions

1. JDK 1.8
2. MySQL 5.7.28
3. Hadoop 3.1.3
4. Hive 3.1.2
5. Scala 2.11.6
6. Spark 2.2.1
7. Livy 0.7.1
8. Elasticsearch 5.0.0
9. Griffin 0.6.0

### 1.2 Installation Paths

All installation files are located under /opt/modules/, as shown below: ![image-20220606152613846](图片/image-20220606152613846-16545003762937.png)

## 2. Software Installation

### 2.1 MySQL Configuration

* Create the database and import the officially provided SQL file: Init_quartz_mysql_innodb.sql

1. Create the quartz database

   ```shell
   mysql -u <username> -e "create database quartz" -p
   # e.g. mysql -u root -e "create database quartz" -p
   ```

2. Import the SQL file

   ```shell
   mysql -u <username> -p quartz < Init_quartz_mysql_innodb.sql
   # e.g. mysql -u root -p quartz < Init_quartz_mysql_innodb.sql
   ```

Note: verify that the database and its tables were created successfully before moving on; a quick check is sketched below.

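The import can be verified directly from the command line. A minimal check, assuming the root account used above (the import should have created the Quartz tables, which share the QRTZ_ prefix referenced later in quartz.properties):

```shell
# list the Quartz tables created by Init_quartz_mysql_innodb.sql
mysql -u root -p -e "USE quartz; SHOW TABLES LIKE 'QRTZ_%';"
```
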
### 2.2 Hadoop Deployment

Note: Hadoop is normally already installed, so this section only reviews and adjusts its configuration files.

1. Check whether **./etc/hadoop/core-site.xml** contains the following setting and add it if it is missing. Remember to sync the change to every node.

   ```xml
   <configuration>
       <property>
           <name>fs.defaultFS</name>
           <!-- ip: replace with the actual NameNode IP -->
           <value>hdfs://ip:9000</value>
       </property>
   </configuration>
   ```

2. Under the Hadoop data directory, i.e. `hadoop-3.1.3/data/hadoop-data`, create the folders nn, dn and snn, then distribute them to the other nodes.

   ```shell
   # create the folders
   [root@master hadoop-data]# mkdir nn
   [root@master hadoop-data]# mkdir dn
   [root@master hadoop-data]# mkdir snn
   [root@master hadoop-data]# cd ..
   # distribute; adjust the IPs and paths to your environment
   # node 1
   [root@master data]# scp -r hadoop-data/ root@10.168.57.11:/opt/modules/hadoop-3.1.3/data/
   # node 2
   [root@master data]# scp -r hadoop-data/ root@10.168.57.12:/opt/modules/hadoop-3.1.3/data/
   ```

3. Check whether **./etc/hadoop/hdfs-site.xml** contains the following settings and add them if they are missing. Remember to sync the change to every node.

   1. Configure the **hdfs-site.xml** file

   ```xml
   <configuration>
       <!-- Secondary NameNode host; slave1 maps to the IP of node 2 -->
       <property>
           <name>dfs.namenode.secondary.http-address</name>
           <value>slave1:50090</value>
       </property>

       <property>
           <name>dfs.namenode.logging.level</name>
           <value>warn</value>
       </property>
       <property>
           <name>dfs.replication</name>
           <value>1</value>
       </property>
       <!-- nn, dn and snn were created by hand above; change the paths to match your local layout -->
       <property>
           <name>dfs.namenode.name.dir</name>
           <value>/opt/modules/hadoop-3.1.3/data/hadoop-data/nn</value>
       </property>
       <property>
           <name>dfs.datanode.data.dir</name>
           <value>/opt/modules/hadoop-3.1.3/data/hadoop-data/dn</value>
       </property>
       <property>
           <name>dfs.namenode.checkpoint.dir</name>
           <value>/opt/modules/hadoop-3.1.3/data/hadoop-data/snn</value>
       </property>
       <property>
           <name>dfs.webhdfs.enabled</name>
           <value>true</value>
       </property>
       <property>
           <name>dfs.datanode.use.datanode.hostname</name>
           <value>false</value>
       </property>
       <property>
           <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
           <value>false</value>
       </property>
   </configuration>
   ```

   2. Distribute hdfs-site.xml to the other nodes

   ```shell
   # adjust the IPs and paths to your environment
   [root@master hadoop]# scp -r hdfs-site.xml root@10.168.57.12:/opt/modules/hadoop-3.1.3/etc/hadoop/

   [root@master hadoop]# scp -r hdfs-site.xml root@10.168.57.11:/opt/modules/hadoop-3.1.3/etc/hadoop/
   ```

4. Edit the Hadoop configuration file /opt/modules/hadoop-3.1.3/etc/hadoop/yarn-site.xml and sync it to all nodes (a ready-made yarn-site.xml is included in the hadoop folder of the install package).

   ```xml
   <!-- Whether to run a thread that checks the physical memory used by each task and kills
        the task if it exceeds its allocation; default is true -->
   <property>
       <name>yarn.nodemanager.pmem-check-enabled</name>
       <value>false</value>
   </property>
   <!-- Whether to run a thread that checks the virtual memory used by each task and kills
        the task if it exceeds its allocation; default is true -->
   <property>
       <name>yarn.nodemanager.vmem-check-enabled</name>
       <value>false</value>
   </property>
   ```

5. Start HDFS

   1. Format the NameNode first (required because the configuration was changed and new data directories were added above)

   ```shell
   hadoop namenode -format
   ```

   2. Enter the installation directory and start HDFS

   ```shell
   cd xxxx
   # start
   sbin/start-dfs.sh

   # stop
   sbin/stop-dfs.sh
   ```

   Once HDFS is up, open http://ip:50070/ ; on Hadoop 3.0 and later (such as the 3.1.3 used here) the NameNode UI is at http://ip:9870/ instead. ![image-20220606152716058](图片/image-20220606152716058-16545004380689.png)

6. Start YARN

   ```shell
   # Note: run this on the node where YARN (the ResourceManager) is deployed
   # enter the Hadoop installation directory
   cd xxxx
   sbin/yarn-daemon.sh start resourcemanager

   # stop
   sbin/yarn-daemon.sh stop resourcemanager
   ```

   Once YARN is up, open http://ip:8088/ ![image-20220606152736734](图片/image-20220606152736734-165450045827211.png)

7. (Optional; not configured on Huawei Cloud) Start the job history server

   ```shell
   # enter the Hadoop installation directory
   cd xxxx
   sbin/mr-jobhistory-daemon.sh start historyserver

   # stop
   sbin/mr-jobhistory-daemon.sh stop historyserver
   ```

   Once started, open http://ip:19888/jobhistory ![image-20220606152901770](图片/image-20220606152901770-165450054331613.png)

   A command-line check of HDFS and YARN is sketched after this list.

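Besides the web UIs, HDFS and YARN can be checked from the command line. A minimal smoke test, assuming the Hadoop binaries are on the PATH:

```shell
jps                    # the NameNode/DataNode (and ResourceManager) processes should be listed
hdfs dfsadmin -report  # shows live DataNodes and their capacity
yarn node -list        # shows the NodeManagers registered with the ResourceManager
```
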
### 2.3 Hive Deployment

1. Upload apache-hive-3.1.2-bin.tar.gz to the server and extract it to the target directory

   ```shell
   # extract
   tar -zxvf /opt/software/apache-hive-3.1.2-bin.tar.gz -C /opt/modules/

   # rename the directory
   mv /opt/modules/apache-hive-3.1.2-bin/ /opt/modules/hive-3.1.2
   ```

2. Check the MySQL installation, log in, and create the metastore database

   ```shell
   # log in to MySQL (username root, password root)
   mysql -uroot -proot
   # create the metastore database
   create database metastore;
   # leave MySQL
   quit;
   # initialize the Hive metastore schema; run this from the shell
   # once hive-site.xml has been configured in step 3
   schematool -initSchema -dbType mysql -verbose
   ```

3. Configure Hive

   a. Upload the MySQL JDBC driver and copy it into Hive's lib directory

   ```shell
   cp /opt/software/mysql-connector-java5.1.37.jar $HIVE_HOME/lib
   ```

   b. Create hive-site.xml

   ```shell
   vim $HIVE_HOME/conf/hive-site.xml
   ```

   c. Add the following to hive-site.xml (watch the IPs and paths; a ready-made copy of this file is included in the install folder)

   ```xml
   <?xml version="1.0" encoding="UTF-8" standalone="no"?><?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
   <configuration>
       <property>
           <name>javax.jdo.option.ConnectionURL</name>
           <value>jdbc:mysql://10.168.57.10:3306/metastore?useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF-8</value>
       </property>

       <property>
           <name>javax.jdo.option.ConnectionDriverName</name>
           <value>com.mysql.jdbc.Driver</value>
       </property>

       <property>
           <name>javax.jdo.option.ConnectionUserName</name>
           <value>root</value>
       </property>

       <property>
           <name>javax.jdo.option.ConnectionPassword</name>
           <value>root</value>
       </property>

       <property>
           <name>hive.metastore.schema.verification</name>
           <value>false</value>
       </property>
       <property>
           <name>hive.cli.print.current.db</name>
           <value>true</value>
       </property>
       <property>
           <name>hive.cli.print.header</name>
           <value>true</value>
       </property>
       <!-- HiveServer2 -->
       <property>
           <name>hive.server2.thrift.port</name>
           <value>10000</value>
       </property>

       <property>
           <name>hive.server2.thrift.bind.host</name>
           <value>10.168.57.10</value>
       </property>

       <property>
           <name>hive.exec.post.hooks</name>
           <value>org.apache.atlas.hive.hook.HiveHook</value>
       </property>
       <property>
           <name>metastore.storage.schema.reader.impl</name>
           <value>org.apache.hadoop.hive.metastore.SerDeStorageSchemaReader</value>
       </property>
       <property>
           <name>hive.server2.authentication</name>
           <value>NONE</value>
       </property>

       <property>
           <name>dfs.permissions.enabled</name>
           <value>false</value>
       </property>

       <property>
           <name>hive.server2.enable.doAs</name>
           <value>FALSE</value>
       </property>

       <!-- HiveServer2 HA parameter; enabling it speeds up HiveServer2 startup -->
       <property>
           <name>hive.server2.active.passive.ha.enable</name>
           <value>true</value>
       </property>
       <property>
           <name>hive.fetch.task.conversion</name>
           <value>more</value>
           <description>
               Expects one of [none, minimal, more].
               Some select queries can be converted to single FETCH task minimizing latency.
               Currently the query should be single sourced not having any subquery and should not have
               any aggregations or distincts (which incurs RS), lateral views and joins.
               0. none : disable hive.fetch.task.conversion
               1. minimal : SELECT STAR, FILTER on partition columns, LIMIT only
               2. more : SELECT, FILTER, LIMIT only (support TABLESAMPLE and virtual columns); simple queries skip MapReduce, which speeds them up
           </description>
       </property>
       <!-- the settings below were added for Griffin -->
       <property>
           <name>hive.exec.local.scratchdir</name>
           <value>/opt/modules/hive-3.1.2/temp/hive</value>
           <description>Local scratch space for Hive jobs</description>
       </property>
       <property>
           <name>hive.downloaded.resources.dir</name>
           <value>/opt/modules/hive-3.1.2/temp/hive/${hive.session.id}_resources</value>
           <description>Temporary local directory for added resources in the remote file system.</description>
       </property>
       <property>
           <name>hive.querylog.location</name>
           <value>/opt/modules/hive-3.1.2/temp/hive</value>
           <description>Location of Hive run time structured log file</description>
       </property>
       <property>
           <name>hive.server2.logging.operation.log.location</name>
           <value>/opt/modules/hive-3.1.2/temp/hive/operation_logs</value>
       </property>
       <!-- metastore connection address -->
       <property>
           <name>hive.metastore.uris</name>
           <value>thrift://127.0.0.1:9083</value>
       </property>

       <!-- metastore event authorization -->
       <property>
           <name>hive.metastore.event.db.notification.api.auth</name>
           <value>false</value>
       </property>

       <!-- Hive's default warehouse directory in HDFS; not configured on Huawei Cloud -->
       <!-- <property>
           <name>hive.metastore.warehouse.dir</name>
           <value>/user/hive/warehouse</value>
       </property>
       -->
   </configuration>
   ```

   d. Start the Hive metastore

   ```shell
   # Start the Hive metastore. The foreground process blocks this shell, so open a new
   # shell window for further work (it can also be started in the background; see the
   # sketch at the end of this section).
   cd $HIVE_HOME
   bin/hive --service metastore
   ```

   e. Test that Hive is working

   ```shell
   # connect with the local CLI
   bin/hive
   show databases;
   show tables;
   ```

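If you prefer not to tie up a terminal, the metastore (and HiveServer2, which listens on the ports set in hive-site.xml above) can be run in the background. A minimal sketch, assuming it is run from $HIVE_HOME and that a logs directory may be created there:

```shell
cd $HIVE_HOME
mkdir -p logs
# run the metastore and HiveServer2 in the background; output goes to ./logs/
nohup bin/hive --service metastore   > logs/metastore.log    2>&1 &
nohup bin/hive --service hiveserver2 > logs/hiveserver2.log  2>&1 &
# confirm they are listening (9083 = metastore, 10000 = HiveServer2 thrift port)
ss -lntp | grep -E '9083|10000'
```
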
### 2.4 Scala Deployment

**Note:** Spark, Livy and Griffin all constrain the Scala version, so Scala 2.11 (2.11.6 here) is required; the tarball is included in the install folder.

1. Extract the Scala tarball into /opt/modules

   ```shell
   [root@master tmp]# tar -zxvf scala-2.11.6.tgz -C /opt/modules/
   ```

2. Add the environment variables

   ```shell
   # scala
   export SCALA_HOME=/opt/modules/scala-2.11.6
   export PATH=$PATH:$SCALA_HOME/bin
   ```

3. Reload the profile

   ```shell
   source /etc/profile
   ```

4. Check that the variables took effect (a version check is also sketched after this list)

   ```shell
   [root@master tmp]# cd $SCALA_HOME
   # landing in the scala-2.11.6 directory means the profile took effect
   [root@master scala-2.11.6]#
   ```

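A more direct check, assuming the scala binary is now on the PATH:

```shell
scala -version   # should report version 2.11.6
```
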
### 2.5 Spark (YARN Deployment)

**Note:** the files to modify and the installation package are in the spark folder.

1. Upload spark-2.2.1-bin-hadoop2.7.tgz to the Linux host, extract it and put it in the target location.

   ```shell
   # extract the Spark package
   tar -zxvf spark-2.2.1-bin-hadoop2.7.tgz -C /opt/modules
   # enter the modules directory
   cd /opt/modules
   # rename the directory
   mv spark-2.2.1-bin-hadoop2.7 spark-2.2.1
   ```

2. Edit conf/spark-env.sh and add the following (adjust to your machine)

   ```shell
   export JAVA_HOME=/usr/java/jdk1.8.0_301
   export SCALA_HOME=/opt/modules/scala-2.11.6
   YARN_CONF_DIR=/opt/modules/hadoop-3.1.3/etc/hadoop
   HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
   SPARK_MASTER_HOST=master
   SPARK_MASTER_PORT=7077
   SPARK_MASTER_WEBUI_PORT=8082
   SPARK_LOCAL_IP=10.168.57.10
   SPARK_PID_DIR=/opt/modules/spark-2.2.1/pids
   ```

3. Start Hadoop HDFS (skip this if it is already running)

4. Start Spark

   ```shell
   # enter the Spark installation directory
   cd xxx
   sbin/start-all.sh
   ```

   Once started, open http://ip:8082/ ![image-20220606170004690](图片/image-20220606170004690.png)

5. Submit a test application to verify the installation

   ```shell
   bin/spark-submit \
   --class org.apache.spark.examples.SparkPi \
   --master yarn \
   --deploy-mode client \
   ./examples/jars/spark-examples_2.11-2.2.1.jar \
   10
   ```

   ![image-20220606173207637](图片/image-20220606173207637.png)

   Check YARN at http://ip:8088/

   ![image-20220606173226832](图片/image-20220606173226832.png)

6. Configure Spark for the data profiling service

   Note: the paths below use /opt/modules/spark-yarn; if you kept the directory name spark-2.2.1 from step 1, adjust the paths accordingly. A listing to confirm the HDFS upload is sketched after this list.

   a. Edit conf/spark-default.conf

   ```shell
   spark.master yarn-cluster
   spark.serializer org.apache.spark.serializer.KryoSerializer
   spark.yarn.jars hdfs:///home/spark_lib/*
   spark.yarn.dist.files hdfs:///home/spark_conf/hive-site.xml
   spark.sql.broadcastTimeout 500
   ```

   b. Edit conf/spark-env.sh

   ```shell
   HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
   SPARK_MASTER_HOST=master
   SPARK_MASTER_PORT=7077
   SPARK_MASTER_WEBUI_PORT=8082
   SPARK_LOCAL_IP=localhost
   SPARK_PID_DIR=/opt/modules/spark-yarn/pids
   ```

   c. Create the directories in HDFS and upload the files

   ```shell
   # create the directories
   hdfs dfs -mkdir /home/spark_lib
   hdfs dfs -mkdir /home/spark_conf
   # upload the files
   hdfs dfs -put /opt/modules/spark-yarn/jars/* hdfs:///home/spark_lib/
   hdfs dfs -put /opt/modules/hive-3.1.2/conf/hive-site.xml hdfs:///home/spark_conf/
   ```

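To confirm that the jars and hive-site.xml actually landed in HDFS (these are the paths that spark.yarn.jars and spark.yarn.dist.files point at), a quick listing helps:

```shell
hdfs dfs -ls /home/spark_lib | head   # should list the Spark jars
hdfs dfs -ls /home/spark_conf         # should list hive-site.xml
```
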
### 2.6 Livy Installation

1. Upload and extract the installation package

2. In the conf directory, copy livy.conf.template to livy.conf and add the following

   ```shell
   # address Livy binds to and the Spark cluster mode used by Livy sessions

   livy.server.host = 43.143.224.212
   livy.spark.master = yarn
   # deploy mode used by Livy sessions
   livy.spark.deploy-mode = cluster
   #livy.spark.deployMode = cluster
   # use HiveContext by default
   livy.repl.enableHiveContext = true
   # enable user impersonation
   livy.impersonation.enabled = true
   # idle session timeout
   livy.server.session.timeout = 1h
   # Livy server port
   livy.server.port = 8998
   ```

+
|
|
|
+3. 拷贝 livy-env.sh.template 为 livy-env.sh,添加如下内容
|
|
|
+
|
|
|
+ ```shell
|
|
|
+ # 路径需要修改为自己的
|
|
|
+ export SPARK_HOME=/opt/modules/spark-yarn
|
|
|
+ export HADOOP_CONF_DIR=/opt/modules/hadoop-3.1.3/etc/hadoop
|
|
|
+ ```
|
|
|
+
|
|
|
+4. 在安装目录下 创建log文件夹
|
|
|
+
|
|
|
+ ```shell
|
|
|
+ mkdir /xxxxx/livy/logs
|
|
|
+ ```
|
|
|
+
|
|
|
5. Start Livy

   ```shell
   # enter the Livy installation directory
   cd xxx
   bin/livy-server start

   # stop
   bin/livy-server stop
   ```

   Once started, open http://ip:8998/ui

   ![image-20220606180513212](图片/image-20220606180513212-165450991535315.png)

6. Test the installation

   a. Create a session

   ```shell
   curl -XPOST 'http://10.168.57.10:8998/sessions' -H "Content-Type:application/json" --data '{"kind":"spark"}'
   ```

   Note: wait until the Livy session state becomes idle before sending it a request; only then will the request be executed. While a request runs, the state changes to busy, and it returns to idle when the request finishes (a REST polling sketch is given at the end of this section).

   b. Create hello.txt in the current directory and upload it to HDFS

   ```shell
   vim hello.txt

   nihao spark
   nihao scala
   hello livy

   hadoop fs -mkdir /livydemo
   hadoop fs -put ./hello.txt /livydemo
   ```

   c. Submit a statement (the word count below reads the hello.txt just uploaded to /livydemo)

   ```shell
   curl -XPOST 'http://10.168.57.10:8998/sessions/0/statements' -H 'Content-Type:application/json' -d '{"code":"sc.textFile(\"hdfs:///livydemo/hello.txt\").flatMap(_.split(\" \")).map((_,1)).reduceByKey(_+_).saveAsTextFile(\"hdfs:///livydemo/result\")"}'
   ```

   Check the run in YARN at http://ip:8088/

   ![image-20220606180638150](图片/image-20220606180638150-165451000036617.png)

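Waiting for the idle state and reading back a statement's result can both be scripted against Livy's REST API with plain GET requests. A minimal sketch, assuming the session id 0 and statement id 0 created above:

```shell
# poll the session until its state is "idle"
curl http://10.168.57.10:8998/sessions/0
# after submitting a statement, read back its status and output
curl http://10.168.57.10:8998/sessions/0/statements/0
```
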
### 2.7 Elasticsearch Deployment

1. Upload and extract the package, then distribute it to the other nodes

   ```shell
   # extract
   tar -zxvf elasticsearch-5.0.0.tar.gz -C /opt/modules
   # rename
   mv elasticsearch-5.0.0 es-cluster
   ```

2. Edit es-cluster/config/elasticsearch.yml and distribute it to the other nodes (node.name and network.host must be adjusted on each node)

   ```yaml
   cluster.name: es-cluster #cluster name; different names mean different clusters
   node.name: master #node name, pick your own
   path.data: /opt/modules/es-cluster/es/data

   path.logs: /opt/modules/es-cluster/es/logs #log path
   bootstrap.memory_lock: false #do not lock memory
   network.host: 10.168.57.10 #bind address; if the numeric IP causes errors, try a hostname alias
   http.port: 9200 #bind port
   discovery.zen.ping.unicast.hosts: ["10.168.57.10", "10.168.57.11", "10.168.57.12"] #cluster member list; IPs or hostnames
   discovery.zen.minimum_master_nodes: 2 #must not exceed the node count (to avoid split brain: 1 for two nodes, 2 for three)
   http.cors.enabled: true #enable CORS over HTTP
   http.cors.allow-origin: "*" #allow all origins
   ```

3. Preparation before starting

   Elasticsearch will not run as root, so create a dedicated es user and hand the installation over to it:

   ```shell
   # create an es user/group to own and run Elasticsearch
   [root@master modules]# groupadd es
   [root@master modules]# useradd es -g es -p codingwhy
   [root@master modules]# chown -R es:es es-cluster
   ```

   Then raise two system limits: the soft/hard open-file limits must be at least 65536, and vm.max_map_count must be at least 262144.

   ```shell
   vim /etc/security/limits.conf
   * soft nofile 65536
   * hard nofile 65536
   vim /etc/sysctl.conf
   # add
   vm.max_map_count=262144
   # apply the change with sysctl -p
   [root@localhost es]# sysctl -p
   ```

+
|
|
|
+1. 转成刚刚创建的用户再进入es的项目中
|
|
|
+
|
|
|
+ ```shell
|
|
|
+ su es
|
|
|
+ ```
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
5. Start Elasticsearch on every node

   ```shell
   # run in the foreground (blocks the shell)
   /opt/modules/es-cluster/bin/elasticsearch
   # run in the background
   /opt/modules/es-cluster/bin/elasticsearch -d
   ```

6. Open http://<node ip>:9200/ on every node

   ![image-20220606182338871](图片/image-20220606182338871-165451102096019.png)

7. Create the griffin index in Elasticsearch (run from the command line); a quick health check is sketched after this list

   ```shell
   curl -k -H "Content-Type: application/json" -X PUT http://10.168.57.10:9200/griffin \
    -d '{
       "aliases": {},
       "mappings": {
           "accuracy": {
               "properties": {
                   "name": {
                       "fields": {
                           "keyword": {
                               "ignore_above": 256,
                               "type": "keyword"
                           }
                       },
                       "type": "text"
                   },
                   "tmst": {
                       "type": "date"
                   }
               }
           }
       },
       "settings": {
           "index": {
               "number_of_replicas": "2",
               "number_of_shards": "5"
           }
       }
   }'
   ```

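Once the index exists, the cluster and index state can be confirmed with the standard health and cat APIs (any of the node IPs will do):

```shell
# cluster status should be green (or yellow while replicas are still being allocated)
curl http://10.168.57.10:9200/_cluster/health?pretty
# the griffin index should be listed with 5 primary shards
curl http://10.168.57.10:9200/_cat/indices?v
```
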
### 2.8 Griffin Deployment

**Note:** the install folder contains two files, `measure-0.6.0.jar` and `service-0.6.0.tar.gz`. Normally only the configuration inside `service-0.6.0.tar.gz` needs to be changed, and most of the changes are paths and IPs. The files that need editing are listed below for future deployments; they all live under the config directory, and the paths in the numbered items show where each file sits in the source tree. If editing the packaged config files is not enough to solve a problem, the project has to be rebuilt from source.

#### 1. Configuration Changes

1. **service/src/main/resources/application.properties**

   Mainly update the IPs, paths and credentials here; a connectivity check for these endpoints is sketched at the end of this subsection.

   ```properties
   # Apache Griffin server port (default 8080)

   server.port = 8091
   spring.application.name=griffin_service
   spring.datasource.url=jdbc:mysql://10.168.57.10:3306/quartz?useSSL=false
   spring.datasource.username=root
   spring.datasource.password=root
   spring.jpa.generate-ddl=true
   spring.datasource.driver-class-name=com.mysql.jdbc.Driver
   spring.jpa.show-sql=true
   # Hive metastore
   hive.metastore.uris=thrift://10.168.57.10:9083
   hive.metastore.dbname=default
   hive.hmshandler.retry.attempts=15
   hive.hmshandler.retry.interval=2000ms
   #Hive jdbc
   hive.jdbc.className=org.apache.hive.jdbc.HiveDriver
   hive.jdbc.url=jdbc:hive2://10.168.57.10:10000/
   hive.need.kerberos=false
   hive.keytab.user=xxx@xx.com
   hive.keytab.path=/path/to/keytab/file
   # Hive cache time
   cache.evict.hive.fixedRate.in.milliseconds=900000
   # Kafka schema registry
   kafka.schema.registry.url=http://localhost:8081
   # Update job instance state at regular intervals
   jobInstance.fixedDelay.in.milliseconds=60000
   # Expired time of job instance which is 7 days that is 604800000 milliseconds.Time unit only supports milliseconds
   jobInstance.expired.milliseconds=604800000
   # schedule predicate job every 5 minutes and repeat 12 times at most
   # interval time unit s:second m:minute h:hour d:day,only support these four units
   predicate.job.interval=5m
   predicate.job.repeat.count=12
   # external properties directory location
   external.config.location=
   # external BATCH or STREAMING env
   external.env.location=
   # login strategy ("default" or "ldap")
   login.strategy=default
   # ldap
   ldap.url=ldap://hostname:port
   ldap.email=@example.com
   ldap.searchBase=DC=org,DC=example
   ldap.searchPattern=(sAMAccountName={0})
   # hdfs default name
   fs.defaultFS=
   # elasticsearch
   elasticsearch.host=10.168.57.10
   elasticsearch.port=9200
   elasticsearch.scheme=http
   # elasticsearch.user = user
   # elasticsearch.password = password
   # livy
   livy.uri=http://10.168.57.10:8998/batches
   livy.need.queue=false
   livy.task.max.concurrent.count=20
   livy.task.submit.interval.second=3
   livy.task.appId.retry.count=3
   livy.need.kerberos=false
   livy.server.auth.kerberos.principal=livy/kerberos.principal
   livy.server.auth.kerberos.keytab=/path/to/livy/keytab/file
   # yarn url
   yarn.uri=http://10.168.57.11:8088
   # griffin event listener
   internal.event.listeners=GriffinJobEventHook

   logging.file=logs/griffin-service.log
   ```

+
|
|
|
+2. **service/src/main/resources/quartz.properties**
|
|
|
+
|
|
|
+ ```properties
|
|
|
+ org.quartz.scheduler.instanceName=spring-boot-quartz
|
|
|
+ org.quartz.scheduler.instanceId=AUTO
|
|
|
+ org.quartz.threadPool.threadCount=5
|
|
|
+ org.quartz.jobStore.class=org.quartz.impl.jdbcjobstore.JobStoreTX
|
|
|
+ # If you use postgresql as your database,set this property value to org.quartz.impl.jdbcjobstore.PostgreSQLDelegate
|
|
|
+ # If you use mysql as your database,set this property value to org.quartz.impl.jdbcjobstore.StdJDBCDelegate
|
|
|
+ # If you use h2 as your database, it's ok to set this property value to StdJDBCDelegate, PostgreSQLDelegate or others
|
|
|
+ #主要是这里要修改,如果用的mysql就是以下这个
|
|
|
+ org.quartz.jobStore.driverDelegateClass=org.quartz.impl.jdbcjobstore.StdJDBCDelegate
|
|
|
+ org.quartz.jobStore.useProperties=true
|
|
|
+ org.quartz.jobStore.misfireThreshold=60000
|
|
|
+ org.quartz.jobStore.tablePrefix=QRTZ_
|
|
|
+ org.quartz.jobStore.isClustered=true
|
|
|
+ org.quartz.jobStore.clusterCheckinInterval=20000
|
|
|
+ ```
|
|
|
+
|
|
|
+3. **service/src/main/resources/sparkProperties.json**
|
|
|
+
|
|
|
+ 这里主要是hdoop文件的相关路径,一般咱们路径就是这个不需要修改
|
|
|
+
|
|
|
   ```json
   {
     "file": "hdfs:///griffin/griffin-measure.jar",
     "className": "org.apache.griffin.measure.Application",
     "queue": "default",
     "numExecutors": 2,
     "executorCores": 1,
     "driverMemory": "1g",
     "executorMemory": "1g",
     "conf": {
       "spark.yarn.dist.files": "hdfs:///home/spark_conf/hive-site.xml"
     },
     "files": [
     ]
   }
   ```

4. **service/src/main/resources/env/env_batch.json**

   ```json
   {
     "spark": {
       "log.level": "WARN"
     },
     "sinks": [
       {
         "name": "console",
         "type": "CONSOLE",
         "config": {
           "max.log.lines": 10
         }
       },
       {
         "name": "hdfs",
         "type": "HDFS",
         "config": {
           "path": "hdfs:///griffin/persist",
           "max.persist.lines": 10000,
           "max.lines.per.file": 10000
         }
       },
       {
         "name": "elasticsearch",
         "type": "ELASTICSEARCH",
         "config": {
           "method": "post",
           "api": "http://10.168.57.10:9200/griffin/accuracy",
           "connection.timeout": "1m",
           "retry": 10
         }
       }
     ],
     "griffin.checkpoint": []
   }
   ```

5. **service/src/main/resources/env/env_streaming.json**

   ```json
   {
     "spark": {
       "log.level": "WARN",
       "checkpoint.dir": "hdfs:///griffin/checkpoint/${JOB_NAME}",
       "init.clear": true,
       "batch.interval": "1m",
       "process.interval": "5m",
       "config": {
         "spark.default.parallelism": 4,
         "spark.task.maxFailures": 5,
         "spark.streaming.kafkaMaxRatePerPartition": 1000,
         "spark.streaming.concurrentJobs": 4,
         "spark.yarn.maxAppAttempts": 5,
         "spark.yarn.am.attemptFailuresValidityInterval": "1h",
         "spark.yarn.max.executor.failures": 120,
         "spark.yarn.executor.failuresValidityInterval": "1h",
         "spark.hadoop.fs.hdfs.impl.disable.cache": true
       }
     },
     "sinks": [
       {
         "type": "CONSOLE",
         "config": {
           "max.log.lines": 100
         }
       },
       {
         "type": "HDFS",
         "config": {
           "path": "hdfs:///griffin/persist",
           "max.persist.lines": 10000,
           "max.lines.per.file": 10000
         }
       },
       {
         "type": "ELASTICSEARCH",
         "config": {
           "method": "post",
           "api": "http://10.168.57.10:9200/griffin/accuracy"
         }
       }
     ],
     "griffin.checkpoint": [
       {
         "type": "zk",
         "config": {
           "hosts": "zk:2181",
           "namespace": "griffin/infocache",
           "lock.path": "lock",
           "mode": "persist",
           "init.clear": true,
           "close.clear": false
         }
       }
     ]
   }
   ```

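Before starting the service it is worth confirming that the endpoints referenced in application.properties are reachable from the Griffin host. A minimal sketch using the values above:

```shell
# MySQL (spring.datasource.url)
mysql -h 10.168.57.10 -uroot -p quartz -e "select 1;"
# Hive metastore port (hive.metastore.uris)
ss -lntp | grep 9083
# Elasticsearch (elasticsearch.host/port) and Livy (livy.uri)
curl http://10.168.57.10:9200/
curl http://10.168.57.10:8998/sessions
```
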
#### 2. Deployment Steps on the Server

1. Upload the measure jar into HDFS

   ```shell
   # rename the jar
   mv measure-0.6.0.jar griffin-measure.jar
   # upload the jar into the /griffin directory in HDFS (create the directory first if it does not exist)
   hdfs dfs -put griffin-measure.jar /griffin/

   # command to create the /griffin directory, if needed
   #hdfs dfs -mkdir /griffin
   ```

2. Extract the service package and run the service (make sure the configuration above is finished before starting it)

   ```shell
   # extract and enter the service directory
   tar -zxvf target/service-0.6.0.tar.gz -C /opt/modules
   cd /opt/modules/service-0.6.0
   # start the service
   ./bin/griffin.sh start
   # or use ./bin/start.sh
   # stop the service
   ./bin/griffin.sh stop
   # or use ./bin/stop.sh
   ```

3. Open the web UI

   ```
   http://<your IP>:<your port>
   ```

   The port is the server.port value set in application.properties (8091 above); a startup check is sketched below.

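If the UI does not come up, a quick way to confirm the service started is to check the listening port and the service log. A minimal sketch, assuming the defaults above and that the service was started from /opt/modules/service-0.6.0:

```shell
# the service should be listening on server.port (8091 above)
ss -lntp | grep 8091
# follow the log configured by logging.file (relative to the directory the service was started from)
tail -f /opt/modules/service-0.6.0/logs/griffin-service.log
```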