update kafka consumer-01

This commit is contained in:
2024-10-27 10:55:40 +08:00
parent 4726888819
commit eece6503c9
45 changed files with 1444 additions and 28 deletions

View File

@@ -0,0 +1,10 @@
2024-10-01 www.example.com
2024-10-01 www.example.com
2024-10-01 www.test.com
2024-10-02 www.example.com
2024-10-02 www.test.com
2024-10-02 www.sample.com
2024-10-03 www.example.com
2024-10-03 www.test.com
2024-10-03 www.sample.com
2024-10-03 www.example.com

View File

@@ -33,6 +33,14 @@
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.9.3</version> <!-- latest stable JUnit 5 release at the time of writing -->
<scope>test</scope>
</dependency>
</dependencies>
<build>
@@ -60,28 +68,28 @@
</filter>
</filters>
<shadedArtifactAttached>true</shadedArtifactAttached>
<shadedArtifactPrimary>true</shadedArtifactPrimary>
<!-- <shadedArtifactPrimary>true</shadedArtifactPrimary>-->
<outputFile>${project.build.directory}/${project.build.finalName}-shaded.jar</outputFile>
<relocators>
<relocator>
<pattern>org.apache.commons</pattern>
<shadedPattern>shade.org.apache.commons</shadedPattern>
</relocator>
</relocators>
<!-- <relocators>-->
<!-- <relocator>-->
<!-- <pattern>org.apache.commons</pattern>-->
<!-- <shadedPattern>shade.org.apache.commons</shadedPattern>-->
<!-- </relocator>-->
<!-- </relocators>-->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
<!-- <filter>-->
<!-- <artifact>*:*</artifact>-->
<!-- <excludes>-->
<!-- <exclude>META-INF/*.SF</exclude>-->
<!-- <exclude>META-INF/*.DSA</exclude>-->
<!-- <exclude>META-INF/*.RSA</exclude>-->
<!-- </excludes>-->
<!-- </filter>-->
</filters>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<!-- ManifestResourceTransformer honors only a single mainClass; the stale
     WordCount entry point is kept here as a comment for reference. -->
<!-- <mainClass>com.aisi.wordcount.WordCountDriver</mainClass> -->
<mainClass>com.aisi.accesscount.VisitCountDriver</mainClass>
</transformer>
</transformers>
</configuration>

View File

@@ -0,0 +1,22 @@
package com.aisi.accesscount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class SortMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
    /**
     * Inverts "date&lt;TAB&gt;count" lines (output of the counting job) so the
     * visit count becomes the map output key, letting the shuffle phase
     * sort records by visit count in ascending order.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input format: date \t total visit count.
        String[] fields = value.toString().split("\t");
        if (fields.length == 2) {
            try {
                int count = Integer.parseInt(fields[1].trim());
                // Key = visit count, value = date, so counts sort naturally.
                context.write(new IntWritable(count), new Text(fields[0]));
            } catch (NumberFormatException ignored) {
                // Skip malformed lines instead of failing the whole task
                // (the original let the exception kill the mapper).
            }
        }
    }
}

View File

@@ -0,0 +1,16 @@
package com.aisi.accesscount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class SortReducer extends Reducer<IntWritable, Text, Text, IntWritable> {
    /**
     * Emits one "date  count" line per date. Keys (visit counts) arrive
     * pre-sorted ascending from the shuffle, so no extra sorting is needed.
     */
    @Override
    protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Several dates may share the same visit count; write each of them.
        for (Text visitDate : values) {
            context.write(visitDate, key);
        }
    }
}

View File

@@ -0,0 +1,51 @@
package com.aisi.accesscount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class VisitCountDriver {
    /**
     * Chains two MapReduce jobs:
     *   1. count visits per date ("Visit Count"),
     *   2. re-key by count so the shuffle sorts dates ascending ("Sort Visits").
     *
     * args[0] = input path of the raw "date url" log lines
     * args[1] = final output path of the sorted result
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path tempOutput = new Path("temp_output");
        // Remove stale intermediate data first: a leftover directory from a
        // previous (possibly failed) run would make job 1 abort with
        // "output directory already exists".
        tempOutput.getFileSystem(conf).delete(tempOutput, true);

        // Job 1: count visits per date.
        Job countJob = Job.getInstance(conf, "Visit Count");
        countJob.setJarByClass(VisitCountDriver.class);
        countJob.setMapperClass(VisitCountMapper.class);
        countJob.setReducerClass(VisitCountReducer.class);
        countJob.setMapOutputKeyClass(Text.class);
        countJob.setMapOutputValueClass(IntWritable.class);
        countJob.setOutputKeyClass(Text.class);
        countJob.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(countJob, new Path(args[0]));
        FileOutputFormat.setOutputPath(countJob, tempOutput);
        if (!countJob.waitForCompletion(true)) {
            System.exit(1);
        }

        // Job 2: sort dates by visit count (ascending, via shuffle ordering).
        Job sortJob = Job.getInstance(conf, "Sort Visits");
        sortJob.setJarByClass(VisitCountDriver.class);
        sortJob.setMapperClass(SortMapper.class);
        sortJob.setReducerClass(SortReducer.class);
        sortJob.setMapOutputKeyClass(IntWritable.class);
        sortJob.setMapOutputValueClass(Text.class);
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(sortJob, tempOutput);
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        boolean sortJobSuccess = sortJob.waitForCompletion(true);

        // Clean up the intermediate directory regardless of the outcome
        // (the original left temp_output behind after every run).
        tempOutput.getFileSystem(conf).delete(tempOutput, true);
        System.exit(sortJobSuccess ? 0 : 1);
    }
}

View File

@@ -0,0 +1,20 @@
package com.aisi.accesscount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class VisitCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
     * Parses each log line of the form "date URL" and emits (date, 1),
     * so the reducer can sum per-date visit totals.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // trim + split on any whitespace run: the original split(" ") silently
        // dropped lines containing double spaces or trailing whitespace.
        String[] fields = value.toString().trim().split("\\s+");
        if (fields.length == 2) {
            String date = fields[0];
            context.write(new Text(date), new IntWritable(1));
        }
    }
}

View File

@@ -0,0 +1,18 @@
package com.aisi.accesscount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class VisitCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Sums the per-date 1s emitted by VisitCountMapper into a total
     * visit count and writes one "date  total" record per date.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable one : values) {
            total += one.get();
        }
        context.write(key, new IntWritable(total));
    }
}

View File

@@ -0,0 +1,7 @@
package com.aisi.api;
// NOTE(review): empty placeholder class in the com.aisi.api package — it has
// no behavior; confirm whether it is still needed or can be removed.
public class Test {
}

View File

@@ -0,0 +1,107 @@
import org.apache.commons.compress.utils.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.*;
public class Example {
    // Shared HDFS handle; opened once for the test class, closed in teardown.
    static FileSystem fs = null;

    /** Connects to the default FileSystem found in the classpath config. */
    @BeforeAll
    public static void setup() throws IOException {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);
    }

    /** Releases the shared FileSystem handle. */
    @AfterAll
    public static void teardown() throws IOException {
        if (fs != null) {
            fs.close();
        }
    }

    /** Lists the direct children of the HDFS root directory. */
    @Test
    public void list() throws IOException {
        for (FileStatus fileStatus : fs.listStatus(new Path("/"))) {
            System.out.println(fileStatus.getPath());
        }
    }

    /** Creates /test (and any missing parents). */
    @Test
    public void mkdir() throws IOException {
        if (fs.mkdirs(new Path("/test"))) {
            System.out.println("mkdirsed");
        } else {
            System.out.println("mkdir failed");
        }
    }

    /** Recursively deletes /test. */
    @Test
    public void delete() throws IOException {
        // second argument true = recursive delete
        if (fs.delete(new Path("/test"), true)) {
            System.out.println("delete");
        } else {
            System.out.println("delete failed");
        }
    }

    /** Uploads a local file into HDFS. */
    @Test
    public void upload() throws IOException {
        fs.copyFromLocalFile(new Path("d:\\tmp\\process.xml"), new Path("/test/process.xml"));
        System.out.println("upload success");
    }

    /** Downloads an HDFS file to the local disk. */
    @Test
    public void download() throws IOException {
        fs.copyToLocalFile(new Path("/test/process.xml"), new Path("d:\\tmp\\process_download.xml"));
        System.out.println("download success");
    }

    /** Prints an HDFS file line by line. */
    @Test
    public void read() throws IOException {
        // try-with-resources closes the stream even on read failure
        // (the original leaked it if an exception was thrown mid-read).
        try (FSDataInputStream fsDataInputStream = fs.open(new Path("/test/process.xml"))) {
            new BufferedReader(new InputStreamReader(fsDataInputStream)).lines().forEach(System.out::println);
        }
    }

    /** Copies /test/process.xml to /test/process_replication.xml line by line. */
    @Test
    public void write() throws IOException {
        try (FSDataInputStream fsDataInputStream = fs.open(new Path("/test/process.xml"));
             FSDataOutputStream fsDataOutputStream = fs.create(new Path("/test/process_replication.xml"));
             BufferedReader reader = new BufferedReader(new InputStreamReader(fsDataInputStream, "utf-8"));
             BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, "utf-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(line);
                writer.newLine();
            }
        }
        // All four resources are closed by try-with-resources in reverse
        // order; the original closed them manually and leaked on exceptions.
    }

    /** Dumps a SequenceFile of Text key/value pairs into a local text file. */
    @Test
    public void read1() throws IOException {
        Configuration conf = new Configuration();
        Text key = new Text();
        Text value = new Text();
        // NOTE(review): this SequenceFile.Reader constructor is deprecated in
        // recent Hadoop; kept as-is to avoid changing behavior.
        try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("/example/part-m-00000"), conf);
             BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("D:\\tmp\\5-12.txt")))) {
            while (reader.next(key, value)) {
                out.write(key.toString() + "\t" + value.toString() + "\r\n");
            }
        }
    }
}

View File

@@ -0,0 +1,56 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.conf.Configuration;
//代码5-3
import java.io.IOException;
public class SelectData {
public static class MyMap extends Mapper<Object, Text, Text, Text> {
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String arr[] = line.split(",");
if (arr[4].contains("2021/1") || arr[4].contains("2021/2")) {
context.write(new Text(arr[2]),
new Text(arr[4].substring(0, 9)));
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if(otherArgs.length<2){
System.err.println("必须输入读取文件路径和输出路径");
System.exit(2);
}
Job job =Job.getInstance(conf,"Select Data");
job.setJarByClass(SelectData.class);
job.setMapperClass(MyMap.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//设置输入格式
job.setInputFormatClass(TextInputFormat.class);
//设置输出格式
job.setOutputFormatClass(SequenceFileOutputFormat.class);
//设置reduce的任务数是0
job.setNumReduceTasks(0);
for(int i=0;i<otherArgs.length-1;++i){
FileInputFormat.addInputPath(job,new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,new Path(otherArgs[otherArgs.length-1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}