update kafka consumer-01
This commit is contained in:
15
Spark/data/shops.txt
Normal file
15
Spark/data/shops.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
sid1,2022-01-18,500
|
||||
sid1,2022-02-10,500
|
||||
sid1,2022-02-10,200
|
||||
sid1,2022-02-11,600
|
||||
sid1,2022-02-12,400
|
||||
sid1,2022-02-13,200
|
||||
sid1,2022-02-15,100
|
||||
sid1,2022-03-05,180
|
||||
sid1,2022-04-05,280
|
||||
sid1,2022-04-06,220
|
||||
sid2,2022-02-10,100
|
||||
sid2,2022-02-11,100
|
||||
sid2,2022-02-13,100
|
||||
sid2,2022-03-15,100
|
||||
sid2,2022-04-15,100
|
||||
12
Spark/data/user.txt
Normal file
12
Spark/data/user.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
id01,2022-02-28
|
||||
id01,2022-03-01
|
||||
id01,2022-03-01
|
||||
id01,2022-03-02
|
||||
id01,2022-03-05
|
||||
id01,2022-03-04
|
||||
id01,2022-03-06
|
||||
id01,2022-03-07
|
||||
id02,2022-03-01
|
||||
id02,2022-03-02
|
||||
id02,2022-03-03
|
||||
id02,2022-03-06
|
||||
@@ -55,6 +55,13 @@
|
||||
<artifactId>hive-jdbc</artifactId>
|
||||
<version>3.1.2</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Spark Streaming -->
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-streaming_2.12</artifactId>
|
||||
<version>3.1.2</version> <!-- 选择你需要的版本 -->
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.SparkContext
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
log4j.rootLogger=info,console
|
||||
log4j.rootLogger=error,console
|
||||
|
||||
log4j.appender.console=org.apache.log4j.ConsoleAppender
|
||||
log4j.appender.console.target=System.out
|
||||
|
||||
115
Spark/src/main/scala/MockData.scala
Normal file
115
Spark/src/main/scala/MockData.scala
Normal file
@@ -0,0 +1,115 @@
|
||||
import java.io.{File, PrintWriter}
import java.text.SimpleDateFormat
import java.util.{Date, Properties, Random}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

/**
 * Generates mock road-monitoring log records and writes each record both to a
 * local file and to the Kafka topic "RoadRealTimeLog".
 *
 * Record layout (tab-separated):
 * day, monitorId, cameraId, car plate, actionTime, speed, roadId, areaId
 */
object MockData {

  /** Returns a string of `index` random decimal digits. */
  def randomNum(index: Int, random: Random): String = {
    val sb = new StringBuilder
    for (_ <- 0 until index) sb.append(random.nextInt(10))
    sb.toString
  }

  /**
   * Draws a random value in [0, num) and left-pads it with zeros to `index`
   * digits. Padding is applied unconditionally: `format` never truncates, so a
   * value that already has `index` or more digits is returned unchanged.
   * (The previous `if (randomNum < 10)` guard skipped padding for two-digit
   * values, which silently broke pad widths greater than 2.)
   */
  def fillZero(random: Random, num: Int, index: Int): String =
    ("%0" + index + "d").format(random.nextInt(num))

  /** Opens (and truncates) the output file at `path`. */
  def initFile(path: String): PrintWriter = new PrintWriter(new File(path))

  /** Writes one line and flushes immediately so the file can be tailed while mocking. */
  def writeDataToFile(pw: PrintWriter, content: String): Unit = {
    pw.write(content + "\n")
    pw.flush()
  }

  def closeFile(pw: PrintWriter): Unit = pw.close()

  /** Builds a string/string producer for a local single-broker Kafka at localhost:9092. */
  def initKafkaProducer(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("acks", "all")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[String, String](props)
  }

  /** Sends one record to the "RoadRealTimeLog" topic (fire-and-forget). */
  def writeDataToKafka(producer: KafkaProducer[String, String], content: String): Unit =
    producer.send(new ProducerRecord[String, String]("RoadRealTimeLog", content))

  def closeKafka(producer: KafkaProducer[String, String]): Unit = producer.close()

  /**
   * Emits 30 simulated cars, each producing up to 300 records at ~50 ms
   * intervals. The hour component advances roughly every 30 records.
   * Resources are closed in a finally block so an exception mid-run does not
   * leak the file handle or the Kafka producer.
   */
  def mock(): Unit = {
    val pw = initFile("d:\\tmp\\data.txt")
    val producer = initKafkaProducer()
    try {
      val random = new Random()
      val locations = Array("鲁", "京", "豫", "京", "沪", "赣", "津", "深", "黑", "粤")
      val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date())

      for (_ <- 0 until 30) {
        // Plate: region prefix + random uppercase letter + 5 random digits.
        val car = locations(random.nextInt(10)) +
          (65 + random.nextInt(26)).asInstanceOf[Char] + randomNum(5, random)
        var baseActionTime = day + " " + fillZero(random, 24, 2)

        for (j <- 0 until random.nextInt(300)) {
          // Roll the hour forward every 30 records (skip the very first record).
          if (j % 30 == 0 && j != 0) {
            var nextHour = ""
            val baseHourParts = baseActionTime.split(" ")

            if (baseHourParts.length > 1) {
              val baseHour = baseHourParts(1)
              if (baseHour.startsWith("0")) {
                if (baseHour.endsWith("9")) {
                  nextHour = "10"
                } else {
                  nextHour = "0" + (baseHour.substring(1).toInt + 1).toString
                }
              } else if (baseHour == "23") {
                // Day boundary: restart at a random hour rather than overflowing to 24.
                nextHour = fillZero(random, 24, 2)
              } else {
                nextHour = (baseHour.toInt + 1).toString
              }
              baseActionTime = day + " " + nextHour
            } else {
              baseActionTime = day + " 00" // fall back to hour 00 if the split failed
            }
          }

          val actionTime = baseActionTime + ":" + fillZero(random, 60, 2) + ":" + fillZero(random, 60, 2)
          val monitorId = fillZero(random, 10, 4)
          val speed = random.nextInt(200) + 1
          val roadId = random.nextInt(50) + 1
          val cameraId = "0" + randomNum(4, random)
          val areald = fillZero(random, random.nextInt(8) + 1, 2)

          val content = day + "\t" + monitorId + "\t" + cameraId + "\t" + car + "\t" +
            actionTime + "\t" + speed + "\t" + roadId + "\t" + areald
          writeDataToFile(pw, content)
          writeDataToKafka(producer, content)
          Thread.sleep(50)
        }
      }
    } finally {
      closeFile(pw)
      closeKafka(producer)
    }
  }

  def main(args: Array[String]): Unit = {
    mock()
  }
}
|
||||
51
Spark/src/main/scala/com/aisi/sparkSql/A1.scala
Normal file
51
Spark/src/main/scala/com/aisi/sparkSql/A1.scala
Normal file
@@ -0,0 +1,51 @@
|
||||
package com.aisi.sparkSql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

/**
 * Loads Spark/data/user.txt (lines of "uid,yyyy-MM-dd") and flags days on
 * which a user was active on consecutive calendar days within a month.
 */
object A1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("user active")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)
    import sqlSc.implicits._

    val userDF: DataFrame = sc.textFile("Spark/data/user.txt")
      .map(t => {
        val line = t.split(",")
        val strDataTime = line(1).split("-")
        val year = strDataTime(0)
        // Strip a single leading zero ("03" -> "3") and leave values like "12" intact.
        // The old check `indexOf(0) == -1` looked for the NUL character, was always
        // true, and therefore stripped the first digit of every month/day.
        val month = if (strDataTime(1).startsWith("0")) strDataTime(1).substring(1) else strDataTime(1)
        val day = if (strDataTime(2).startsWith("0")) strDataTime(2).substring(1) else strDataTime(2)
        userRecord(line(0), year, month, day)
      }).toDF()

    userDF.show()

    // Group by user id and month, collecting the active days of that month.
    val groupedDF = userDF.groupBy("uid", "year", "month")
      .agg(collect_list("day").as("days"))

    // Explode the collected days back out and cast to int for arithmetic.
    val explodedDF = groupedDF
      .withColumn("day", explode($"days"))
      .withColumn("day", $"day".cast("int"))

    // Order days within each (uid, year, month) partition.
    val windowSpec = Window.partitionBy("uid", "year", "month").orderBy("day")

    // A day is "active" when it directly follows the previous active day.
    val resultDF = explodedDF
      .withColumn("prev_day", lag("day", 1).over(windowSpec))
      .withColumn("day_diff", $"day" - $"prev_day")
      .withColumn("is_active", when($"day_diff" === 1, 1).otherwise(0))

    resultDF.show()
  }
}

// One parsed activity record: user id plus the split date parts (as strings).
final case class userRecord(uid: String, year: String, month: String, day: String)
|
||||
44
Spark/src/main/scala/com/aisi/sparkSql/A2.scala
Normal file
44
Spark/src/main/scala/com/aisi/sparkSql/A2.scala
Normal file
@@ -0,0 +1,44 @@
|
||||
package com.aisi.sparkSql

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, RelationalGroupedDataset, SQLContext, SparkSession}

import java.text.DateFormat
import java.time.format.DateTimeFormatter

/**
 * Loads Spark/data/shops.txt (lines of "sid,yyyy-MM-dd,money") and sums
 * revenue per shop per month.
 */
object A2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("shop count")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)
    import sqlSc.implicits._

    // Input columns: sid, dt (yyyy-MM-dd), money
    val userDF: DataFrame = sc.textFile("Spark/data/shops.txt")
      .map(t => {
        val line = t.split(",")
        val sid = line(0)
        val strDataTime = line(1).split("-")
        val year = strDataTime(0)
        // Strip a single leading zero ("02" -> "2") and leave "10".."12" intact.
        // The old check `indexOf(0) == -1` looked for the NUL character, was always
        // true, and therefore stripped the first digit of every month/day.
        val month = if (strDataTime(1).startsWith("0")) strDataTime(1).substring(1) else strDataTime(1)
        val day = if (strDataTime(2).startsWith("0")) strDataTime(2).substring(1) else strDataTime(2)
        val money = line(2).toInt
        shopRecord(sid, year, month, day, money)
      }).toDF()

    userDF.show()

    // RelationalGroupedDataset keyed by (sid, month); sum() aggregates the numeric columns.
    val dataset = userDF.groupBy("sid", "month")
    println(dataset)
    dataset.sum().show()
  }
}

// One parsed sales record: shop id, split date parts, and the sale amount.
final case class shopRecord(sid: String, year: String, month: String, day: String, money: Int)
|
||||
152
Spark/src/main/scala/com/aisi/sparkSql/MockData.scala
Normal file
152
Spark/src/main/scala/com/aisi/sparkSql/MockData.scala
Normal file
@@ -0,0 +1,152 @@
|
||||
package com.aisi.sparkSql

import java.io.{File, PrintWriter}
import java.text.SimpleDateFormat
import java.util.{Date, Properties, Random}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

/**
 * Generates mock road-monitoring log records and writes each record both to a
 * local file and to the Kafka topic "RoadRealTimeLog".
 *
 * Record layout (tab-separated):
 * day, monitorId, cameraId, car plate, actionTime, speed, roadId, areaId
 */
object MockData {

  /** Returns a string of `index` random decimal digits. */
  def randomNum(index: Int, random: Random): String = {
    var str = ""
    for (_ <- 0 until index) {
      str += random.nextInt(10)
    }
    str
  }

  /** Draws a random value in [0, num) and left-pads single-digit values with zeros to `index` digits. */
  def fillZero(random: Random, num: Int, index: Int): String = {
    val randomNum = random.nextInt(num)
    var randomNumStr = randomNum.toString
    if (randomNum < 10) {
      randomNumStr = ("%0" + index + "d").format(randomNum)
    }
    randomNumStr
  }

  /** Opens (and truncates) the output file at `path`. */
  def initFile(path: String): PrintWriter = {
    new PrintWriter(new File(path))
  }

  /** Writes one line and flushes immediately so the file can be tailed while mocking. */
  def writeDataToFile(pw: PrintWriter, content: String): Unit = {
    pw.write(content + "\n")
    pw.flush()
  }

  def closeFile(pw: PrintWriter): Unit = {
    pw.close()
  }

  /** Builds a string/string producer for a local single-broker Kafka at localhost:9092. */
  def initKafkaProducer(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("acks", "all")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[String, String](props)
  }

  /** Sends one record to the "RoadRealTimeLog" topic (fire-and-forget). */
  def writeDataToKafka(producer: KafkaProducer[String, String], content: String): Unit = {
    producer.send(new ProducerRecord[String, String]("RoadRealTimeLog", content))
  }

  def closeKafka(producer: KafkaProducer[String, String]): Unit = {
    producer.close()
  }

  /**
   * Emits 3000 simulated cars, each producing up to 300 records at ~50 ms
   * intervals; the hour component advances roughly every 30 records.
   */
  def mock(): Unit = {
    // NOTE(review): "路径" is a placeholder ("path") — replace with a real output file path.
    val pw = initFile("路径")
    val producer = initKafkaProducer()
    val random = new Random()
    val locations = Array("鲁", "京", "豫", "京", "沪", "赣", "津", "深", "黑", "粤")
    val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date())

    for (i <- 0 until 3000) {
      // Plate: region prefix + random uppercase letter + 5 random digits.
      val car = locations(random.nextInt(10)) +
        (65 + random.nextInt(26)).asInstanceOf[Char] + randomNum(5, random)
      // BUG FIX: the original concatenated with "" (empty string), so
      // baseActionTime had no space and split(" ")(1) below threw
      // ArrayIndexOutOfBoundsException on the first hour rollover.
      var baseActionTime = day + " " + fillZero(random, 24, 2)

      for (j <- 0 until random.nextInt(300)) {
        // Roll the hour forward every 30 records (skip the very first record).
        // BUG FIX: use short-circuit && instead of bitwise &.
        if (j % 30 == 0 && j != 0) {
          var nextHour = ""
          val baseHour = baseActionTime.split(" ")(1)

          if (baseHour.startsWith("0")) {
            if (baseHour.endsWith("9")) {
              nextHour = "10"
            } else {
              nextHour = "0" + (baseHour.substring(1).toInt + 1).toString
            }
          } else if (baseHour == "23") {
            // Day boundary: restart at a random hour rather than overflowing to 24.
            nextHour = fillZero(random, 24, 2)
          } else {
            nextHour = (baseHour.toInt + 1).toString
          }
          baseActionTime = day + " " + nextHour
        }

        val actionTime = baseActionTime + ":" + fillZero(random, 60, 2) + ":" + fillZero(random, 60, 2)
        val monitorId = fillZero(random, 10, 4)
        val speed = random.nextInt(200) + 1
        val roadId = random.nextInt(50) + 1
        val cameraId = "0" + randomNum(4, random)
        val areald = fillZero(random, random.nextInt(8) + 1, 2)

        val content = day + "\t" + monitorId + "\t" + cameraId + "\t" + car + "\t" +
          actionTime + "\t" + speed + "\t" + roadId + "\t" + areald

        writeDataToFile(pw, content)
        writeDataToKafka(producer, content)
        Thread.sleep(50)
      }
    }
    closeFile(pw)
    closeKafka(producer)
  }

  def main(args: Array[String]): Unit = {
    mock()
  }
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.hive.jdbc.HiveDriver
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.sql.expressions.Window
|
||||
import org.apache.spark.sql.{DataFrame, SQLContext}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Window
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.sql.{DataFrame, SQLContext}
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Window
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{DataFrame, SQLContext}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.aisi.spark
|
||||
package com.aisi.sparkSql
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.aisi.sparkSreaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Word count over a TCP text stream: reads lines from localhost:6666 and
 * prints per-word counts for each 5-second micro-batch until terminated.
 */
object TestStreaming {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("testStreaming")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Split each line into words, pair each word with 1, and sum per word.
    val wordCounts = ssc
      .socketTextStream("localhost", 6666)
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey((a, b) => a + b)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
|
||||
Reference in New Issue
Block a user