commit

2024-10-11 11:12:32 +08:00
commit 8b4a30b940
30 changed files with 1005115 additions and 0 deletions
@@ -0,0 +1,39 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <!--
+    POM for the MapReduceTest module, a child of com.aisi:HaiNiuProjects 1.0-SNAPSHOT.
+    This file declares no dependencies; it only binds the maven-shade-plugin "shade" goal
+    to the package phase and sets the compiler level and source encoding.
+    NOTE(review): the flattened layout suggests this is a dependency-reduced POM generated
+    by maven-shade-plugin rather than a hand-written file. Confirm before editing it or
+    keeping it under version control.
+  -->
+  <parent>
+    <artifactId>HaiNiuProjects</artifactId>
+    <groupId>com.aisi</groupId>
+    <version>1.0-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+  <artifactId>MapReduceTest</artifactId>
+  <build>
+    <plugins>
+      <plugin>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>3.2.4</version>
+        <executions>
+          <execution>
+            <!-- Run the shade goal whenever the module is packaged. -->
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+          </execution>
+        </executions>
+        <configuration>
+          <!--
+            NOTE(review): archive/manifest is the archiver configuration understood by
+            maven-jar-plugin; it is presumably not read by maven-shade-plugin, in which case
+            this mainClass never reaches the shaded jar's manifest. Verify against the shade
+            plugin parameter list.
+          -->
+          <archive>
+            <manifest>
+              <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+              <mainClass>com.aisi.wordcount.WordCountDriver</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+  <properties>
+    <!-- Sources are UTF-8 and are compiled for Java 8. -->
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <maven.compiler.source>8</maven.compiler.source>
+    <maven.compiler.target>8</maven.compiler.target>
+  </properties>
+</project>
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <!--
+      Build definition for the MapReduceTest module: a Hadoop word-count job that is
+      packaged as a single runnable "shaded" jar with WordCountDriver as its main class.
+    -->
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>com.aisi</groupId>
+        <artifactId>HaiNiuProjects</artifactId>
+        <version>1.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>MapReduceTest</artifactId>
+
+    <properties>
+        <!-- Compile for Java 8; sources are UTF-8. -->
+        <maven.compiler.source>8</maven.compiler.source>
+        <maven.compiler.target>8</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-client</artifactId>
+            <version>3.1.4</version>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-simple</artifactId>
+            <version>1.7.32</version>
+        </dependency>
+        <dependency>
+            <groupId>log4j</groupId>
+            <artifactId>log4j</artifactId>
+            <version>1.2.17</version>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>3.2.4</version>
+                <executions>
+                    <execution>
+                        <!-- Build the fat jar as part of "mvn package". -->
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>false</createDependencyReducedPom>
+                            <!--
+                              Strip the signature files of signed dependencies: once jars are
+                              merged the signatures no longer match and the JVM would refuse
+                              to load the shaded jar. Declared once; a second identical
+                              filters section was redundant.
+                            -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+                            <!--
+                              NOTE(review): per the shade plugin documentation, setting
+                              outputFile means the shaded jar neither replaces the main
+                              artifact nor is attached, so shadedArtifactAttached is ignored.
+                              shadedArtifactPrimary does not appear among the documented
+                              shade parameters. Confirm and drop both if unused.
+                            -->
+                            <shadedArtifactAttached>true</shadedArtifactAttached>
+                            <shadedArtifactPrimary>true</shadedArtifactPrimary>
+                            <outputFile>${project.build.directory}/${project.build.finalName}-shaded.jar</outputFile>
+                            <!--
+                              Move the bundled Apache Commons classes into a private package
+                              so they cannot clash with other versions on the runtime
+                              classpath. The parameter name is relocations/relocation;
+                              unknown names such as "relocators" are silently ignored.
+                            -->
+                            <relocations>
+                                <relocation>
+                                    <pattern>org.apache.commons</pattern>
+                                    <shadedPattern>shade.org.apache.commons</shadedPattern>
+                                </relocation>
+                            </relocations>
+                            <transformers>
+                                <!-- Merge META-INF/services files contributed by several jars. -->
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                                <!-- Write Main-Class into the manifest so the jar is runnable. -->
+                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    <mainClass>com.aisi.wordcount.WordCountDriver</mainClass>
+                                </transformer>
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
@@ -0,0 +1,45 @@
+package com.aisi.wordcount;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+import java.io.IOException;
+
+/**
+ * Entry point that configures and submits the word-count MapReduce job.
+ *
+ * <p>Usage: {@code WordCountDriver <input path> <output path>}
+ */
+public class WordCountDriver {
+    /**
+     * Builds the job, runs it to completion and exits the JVM with the job status.
+     *
+     * @param args {@code args[0]} is the input path, {@code args[1]} the output path;
+     *             any further arguments are ignored
+     * @throws IOException            if the job cannot be created or submitted
+     * @throws InterruptedException   if waiting for the job is interrupted
+     * @throws ClassNotFoundException if a job class cannot be resolved
+     */
+    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
+        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
+        if (args.length < 2) {
+            System.err.println("Usage: WordCountDriver <input path> <output path>");
+            System.exit(2);
+        }
+
+        // Create the configuration object.
+        Configuration conf = new Configuration();
+//        conf.set("fs.defaultFS", "hdfs://localhost:9000/");
+        // Create the job.
+        Job job = Job.getInstance(conf, "wordcount");
+        // Class used to locate the jar that is shipped with the job.
+        job.setJarByClass(WordCountDriver.class);
+        // Mapper and reducer implementations.
+        job.setMapperClass(WordCountMapper.class);
+        job.setReducerClass(WordCountReducer.class);
+        // Reducer (final) output types.
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(IntWritable.class);
+        // Mapper (intermediate) output types.
+        job.setMapOutputKeyClass(Text.class);
+        job.setMapOutputValueClass(IntWritable.class);
+        // Input to process, e.g. hdfs://ns1/word/words.txt
+        FileInputFormat.addInputPath(job, new Path(args[0]));
+        // Where the result is written, e.g. hdfs://ns1/word/result
+        FileOutputFormat.setOutputPath(job, new Path(args[1]));
+
+        // Exit status 0 when the job succeeded, 1 otherwise.
+        boolean completion = job.waitForCompletion(true);
+        System.exit(completion ? 0 : 1);
+    }
+}
@@ -0,0 +1,27 @@
+package com.aisi.wordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+
+/**
+ * Mapper of the word-count job: emits {@code (word, 1)} for every
+ * whitespace-separated token of an input line.
+ */
+public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
+    /** The constant count written for each token; shared instead of allocated per token. */
+    private static final IntWritable ONE = new IntWritable(1);
+
+    /** Output key reused across tokens, as in the Hadoop tutorial's WordCount example. */
+    private final Text outKey = new Text();
+
+    /**
+     * Splits one line into words and writes {@code (word, 1)} for each of them.
+     *
+     * @param key     offset of the start of the line
+     * @param value   the content of one line
+     * @param context sink for the emitted pairs
+     * @throws IOException          if writing a pair fails
+     * @throws InterruptedException if the task is interrupted while writing
+     */
+    @Override
+    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
+        System.out.println("map invoke...");
+        // Split on any run of whitespace so tabs and repeated spaces separate words too.
+        for (String word : value.toString().split("\\s+")) {
+            // A blank line or leading whitespace yields an empty token; it is not a word.
+            if (word.isEmpty()) {
+                continue;
+            }
+            outKey.set(word);
+            context.write(outKey, ONE);
+            // e.g. (hello,1) (pooo,1) (shenjianz,1)
+        }
+    }
+}
@@ -0,0 +1,25 @@
+package com.aisi.wordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+
+import java.io.IOException;
+
+/**
+ * Reducer of the word-count job: adds up the counts received for one word.
+ *
+ * <p>Example: the mapper output (hello,1) (pooo,1) (shenjianz,1) (hello,1) reaches this
+ * class grouped as (hello,[1,1]) and is written out as (hello,2).
+ */
+public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+
+    /**
+     * Sums every count belonging to {@code key} and writes the total.
+     *
+     * @param key     the word
+     * @param values  the counts emitted for that word
+     * @param context sink for the {@code (word, total)} pair
+     * @throws IOException          if writing the pair fails
+     * @throws InterruptedException if the task is interrupted while writing
+     */
+    @Override
+    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
+        System.out.println("reduce invoke...");
+
+        // Running total of occurrences of this word.
+        int total = 0;
+        final java.util.Iterator<IntWritable> counts = values.iterator();
+        while (counts.hasNext()) {
+            total += counts.next().get();
+        }
+
+        final IntWritable result = new IntWritable(total);
+        context.write(key, result);
+    }
+}
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Core Hadoop settings: default filesystem, temporary directory, proxy-user rules for
+  the root user and the ZooKeeper quorum used for HA.
+-->
+<configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://ns1</value>
+        <description>默认文件服务的协议和NS逻辑名称，和hdfs-site.xml里的对应此配置替代了1.0里的fs.default.name</description>
+    </property>
+
+    <property>
+        <name>hadoop.tmp.dir</name>
+        <value>/data/tmp</value>
+        <description>数据存储目录</description>
+    </property>
+
+    <property>
+        <name>hadoop.proxyuser.root.groups</name>
+        <value>hadoop</value>
+        <description>
+            hdfs dfsadmin -refreshSuperUserGroupsConfiguration,
+            yarn rmadmin -refreshSuperUserGroupsConfiguration
+            使用这两个命令不用重启就能刷新
+        </description>
+    </property>
+
+    <property>
+        <name>hadoop.proxyuser.root.hosts</name>
+        <value>localhost</value>
+        <description>本地代理</description>
+    </property>
+
+    <!-- ZKFC (ZooKeeper failover controller) configuration -->
+    <property>
+        <name>ha.zookeeper.quorum</name>
+        <value>nn1:2181,nn2:2181,nn3:2181</value>
+        <description>HA使用的zookeeper地址</description>
+    </property>
+</configuration>
@@ -0,0 +1,140 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  HDFS settings for the HA nameservice "ns1": three NameNodes (nn1, nn2, nn3) sharing
+  edits through a quorum journal, automatic failover, and DataNode storage/replication.
+-->
+<configuration>
+    <property>
+        <name>dfs.namenode.name.dir</name>
+        <value>/data/namenode</value>
+        <description>namenode本地文件存放地址</description>
+    </property>
+
+    <property>
+        <name>dfs.nameservices</name>
+        <value>ns1</value>
+        <description>提供服务的NS逻辑名称，与core-site.xml里的对应</description>
+    </property>
+
+    <!-- NameNode configuration -->
+    <!-- Main settings -->
+    <property>
+        <name>dfs.ha.namenodes.ns1</name>
+        <value>nn1,nn2,nn3</value>
+        <description>列出该逻辑名称下的NameNode逻辑名称</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.rpc-address.ns1.nn1</name>
+        <value>nn1:9000</value>
+        <description>指定NameNode的RPC位置</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.http-address.ns1.nn1</name>
+        <value>nn1:50070</value>
+        <description>指定NameNode的Web Server位置</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.rpc-address.ns1.nn2</name>
+        <value>nn2:9000</value>
+        <description>指定NameNode的RPC位置</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.http-address.ns1.nn2</name>
+        <value>nn2:50070</value>
+        <description>指定NameNode的Web Server位置</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.rpc-address.ns1.nn3</name>
+        <value>nn3:9000</value>
+        <description>指定NameNode的RPC位置</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.http-address.ns1.nn3</name>
+        <value>nn3:50070</value>
+        <description>指定NameNode的Web Server位置</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.handler.count</name>
+        <value>77</value>
+        <description>namenode的工作线程数</description>
+    </property>
+
+    <!-- JournalNode configuration, so the other two NameNodes sync the first NameNode's data -->
+    <property>
+        <name>dfs.namenode.shared.edits.dir</name>
+        <value>qjournal://nn1:8485;nn2:8485;nn3:8485/ns1</value>
+        <description>指定用于HA存放edits的共享存储，通常是namenode的所在机器</description>
+    </property>
+
+    <property>
+        <name>dfs.journalnode.edits.dir</name>
+        <value>/data/journaldata/</value>
+        <description>journaldata服务存放文件的地址</description>
+    </property>
+
+    <property>
+        <name>ipc.client.connect.max.retries</name>
+        <value>10</value>
+        <description>namenode和journalnode的链接重试次数10次</description>
+    </property>
+
+    <property>
+        <name>ipc.client.connect.retry.interval</name>
+        <value>10000</value>
+        <description>重试的间隔时间10s</description>
+    </property>
+
+    <!-- ZKFC (ZooKeeper failover controller) configuration -->
+    <property>
+        <name>dfs.ha.fencing.methods</name>
+        <value>sshfence</value>
+        <description>指定HA做隔离的方法，缺省是ssh，可设为shell，稍后详述</description>
+    </property>
+
+    <property>
+        <name>dfs.ha.fencing.ssh.private-key-files</name>
+        <value>/home/hadoop/.ssh/id_rsa</value>
+        <description>杀死命令脚本的免密配置秘钥</description>
+    </property>
+
+    <property>
+        <name>dfs.client.failover.proxy.provider.ns1</name>
+        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
+        <description>指定客户端用于HA切换的代理类，不同的NS可以用不同的代理类以上示例为Hadoop 2.0自带的缺省代理类</description>
+    </property>
+
+    <!--
+      NOTE(review): the key below targets a nameservice called "auto-ha", but
+      dfs.nameservices in this file only declares "ns1". It looks like a leftover;
+      confirm that nothing uses an hdfs://auto-ha URI and remove it if so.
+    -->
+    <property>
+        <name>dfs.client.failover.proxy.provider.auto-ha</name>
+        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
+    </property>
+
+    <property>
+        <name>dfs.ha.automatic-failover.enabled</name>
+        <value>true</value>
+    </property>
+    <!-- DataNode configuration -->
+    <property>
+        <name>dfs.datanode.data.dir</name>
+        <value>/data/datanode</value>
+        <description>datanode本地文件存放地址</description>
+    </property>
+    <property>
+        <name>dfs.replication</name>
+        <value>3</value>
+        <description>文件复本数</description>
+    </property>
+    <property>
+        <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
+        <value>false</value>
+    </property>
+    <property>
+        <name>dfs.client.use.datanode.hostname</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>dfs.datanode.use.datanode.hostname</name>
+        <value>true</value>
+    </property>
+</configuration>