update kafka consumer-01

This commit is contained in:
2024-10-27 10:55:40 +08:00
parent 4726888819
commit eece6503c9
45 changed files with 1444 additions and 28 deletions

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.{SparkConf, SparkContext}

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.SparkContext

View File

@@ -1,4 +1,4 @@
log4j.rootLogger=info,console
log4j.rootLogger=error,console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.out

View File

@@ -0,0 +1,115 @@
import java.io.{File, PrintWriter}
import java.text.SimpleDateFormat
import java.util.{Date, Properties, Random}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
object MockData {

  /** Returns a string of `index` random decimal digits (0-9). */
  def randomNum(index: Int, random: Random): String = {
    val sb = new StringBuilder
    for (_ <- 0 until index) {
      sb.append(random.nextInt(10))
    }
    sb.toString
  }

  /**
   * Draws a random value in [0, num) and left-pads it with zeros to `index` digits.
   *
   * Padding is now applied unconditionally via "%0Nd" (values that already have
   * >= index digits are returned unchanged by the format). The old code only
   * padded values < 10, which under-padded two-digit values whenever index > 2;
   * all existing call sites produce identical output under this version.
   */
  def fillZero(random: Random, num: Int, index: Int): String =
    ("%0" + index + "d").format(random.nextInt(num))

  /** Opens `path` for writing, truncating any existing file. */
  def initFile(path: String): PrintWriter =
    new PrintWriter(new File(path))

  /** Writes one line and flushes immediately so output survives an abrupt stop. */
  def writeDataToFile(pw: PrintWriter, content: String): Unit = {
    pw.write(content + "\n")
    pw.flush()
  }

  def closeFile(pw: PrintWriter): Unit = pw.close()

  /** Builds a String/String producer against a local broker with full-ack durability. */
  def initKafkaProducer(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("acks", "all")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[String, String](props)
  }

  /** Sends one record to the RoadRealTimeLog topic (fire-and-forget, async send). */
  def writeDataToKafka(producer: KafkaProducer[String, String], content: String): Unit =
    producer.send(new ProducerRecord[String, String]("RoadRealTimeLog", content))

  def closeKafka(producer: KafkaProducer[String, String]): Unit = producer.close()

  /**
   * Generates mock road-monitoring records for 30 random cars and writes each
   * record both to a local file and to Kafka, tab-separated:
   * date, monitorId, cameraId, plate, actionTime, speed, roadId, areaId.
   */
  def mock(): Unit = {
    val pw = initFile("d:\\tmp\\data.txt")
    val producer = initKafkaProducer()
    val random = new Random()
    // Province prefixes for mock licence plates ("京" appears twice, weighting it slightly).
    val locations = Array("鲁", "京", "豫", "京", "沪", "赣", "津", "深", "黑", "粤")
    val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date())
    for (i <- 0 until 30) {
      // Plate = province prefix + random uppercase letter + 5 random digits.
      val car = locations(random.nextInt(10)) + (65 + random.nextInt(26)).asInstanceOf[Char] + randomNum(5, random)
      // "yyyy-MM-dd HH" — the hour component is advanced every 30 records below.
      var baseActionTime = day + " " + fillZero(random, 24, 2)
      for (j <- 0 until random.nextInt(300)) {
        if (j % 30 == 0 && j != 0) {
          // Advance the hour, keeping the two-digit zero-padded form.
          var nextHour = ""
          val baseHourParts = baseActionTime.split(" ")
          if (baseHourParts.length > 1) {
            val baseHour = baseHourParts(1)
            if (baseHour.startsWith("0")) {
              if (baseHour.endsWith("9")) {
                nextHour = "10"
              } else {
                nextHour = "0" + (baseHour.substring(1).toInt + 1).toString
              }
            } else if (baseHour == "23") {
              // Past the last hour: restart at a fresh random hour on the same date.
              nextHour = fillZero(random, 24, 2)
            } else {
              nextHour = (baseHour.toInt + 1).toString
            }
            baseActionTime = day + " " + nextHour
          } else {
            baseActionTime = day + " 00" // fall back to hour 00 if the timestamp could not be split
          }
        }
        val actionTime = baseActionTime + ":" + fillZero(random, 60, 2) + ":" + fillZero(random, 60, 2)
        val monitorId = fillZero(random, 10, 4)
        val speed = random.nextInt(200) + 1
        val roadId = random.nextInt(50) + 1
        val cameraId = "0" + randomNum(4, random)
        val areaId = fillZero(random, random.nextInt(8) + 1, 2)
        val content = day + "\t" + monitorId + "\t" + cameraId + "\t" + car + "\t" + actionTime + "\t" + speed + "\t" + roadId + "\t" + areaId
        writeDataToFile(pw, content)
        writeDataToKafka(producer, content)
        Thread.sleep(50)
      }
    }
    closeFile(pw)
    closeKafka(producer)
  }

  def main(args: Array[String]): Unit = mock()
}

View File

@@ -0,0 +1,51 @@
package com.aisi.sparkSql;
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
object A1 {
  /**
   * Computes per-user day-over-day activity: for each (uid, year, month) it
   * orders the active days and flags a day as consecutive (`is_active` = 1)
   * when it is exactly one day after the previous active day.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("user active")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)
    import sqlSc.implicits._

    // Parse "uid,yyyy-MM-dd" rows, stripping a leading zero from month/day.
    // BUG FIX: the old code used indexOf(0) — a search for the NUL character,
    // which is always -1 — so substring(1) stripped the first character of
    // EVERY month/day (e.g. month "12" became "2"). startsWith("0") is the
    // intended leading-zero check.
    val userDF: DataFrame = sc.textFile("Spark/data/user.txt")
      .map { t =>
        val line = t.split(",")
        val strDataTime = line(1).split("-")
        val year = strDataTime(0)
        val month = if (strDataTime(1).startsWith("0")) strDataTime(1).substring(1) else strDataTime(1)
        val day = if (strDataTime(2).startsWith("0")) strDataTime(2).substring(1) else strDataTime(2)
        userRecord(line(0), year, month, day)
      }.toDF()
    userDF.show()

    // Collect each user's active days per (uid, year, month).
    val groupedDF = userDF.groupBy("uid", "year", "month")
      .agg(collect_list("day").as("days"))

    // Explode back to one row per day and cast to int for arithmetic.
    val explodedDF = groupedDF
      .withColumn("day", explode($"days"))
      .withColumn("day", $"day".cast("int"))

    // Order days within each user-month so neighbouring days can be compared.
    val windowSpec = Window.partitionBy("uid", "year", "month").orderBy("day")

    // day_diff == 1 means the day directly follows the previous active day.
    val resultDF = explodedDF
      .withColumn("prev_day", lag("day", 1).over(windowSpec))
      .withColumn("day_diff", $"day" - $"prev_day")
      .withColumn("is_active", when($"day_diff" === 1, 1).otherwise(0))
    resultDF.show()
  }
}
// One user-activity row: uid plus the year/month/day components of the
// activity date (kept as strings for DataFrame construction via toDF()).
case class userRecord(uid: String, year: String, month: String, day: String)

View File

@@ -0,0 +1,44 @@
package com.aisi.sparkSql
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, RelationalGroupedDataset, SQLContext, SparkSession}
import java.text.DateFormat
import java.time.format.DateTimeFormatter
/**
* 计算连续活跃用户的记录
*/
object A2 {
  /**
   * Aggregates shop revenue: parses "sid,yyyy-MM-dd,money" rows, groups by
   * (sid, month) and prints the per-group sums of the numeric columns.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("shop count")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)
    import sqlSc.implicits._

    // Parse "sid,dt,money", stripping a leading zero from month/day.
    // BUG FIX: indexOf(0) searched for the NUL character (always -1), so the
    // first character of EVERY month/day was stripped (e.g. "12" -> "2");
    // startsWith("0") is the intended leading-zero check.
    val userDF: DataFrame = sc.textFile("Spark/data/shops.txt")
      .map { t =>
        val line = t.split(",")
        val sid = line(0)
        val strDataTime = line(1).split("-")
        val year = strDataTime(0)
        val month = if (strDataTime(1).startsWith("0")) strDataTime(1).substring(1) else strDataTime(1)
        val day = if (strDataTime(2).startsWith("0")) strDataTime(2).substring(1) else strDataTime(2)
        val money = line(2).toInt
        shopRecord(sid, year, month, day, money)
      }.toDF()
    userDF.show()

    // groupBy yields a RelationalGroupedDataset keyed on (sid, month);
    // sum() aggregates every numeric column (here: money).
    val dataset = userDF.groupBy("sid", "month")
    println(dataset)
    dataset.sum().show()
  }
}
// One shop-revenue row: shop id, the year/month/day components of the sale
// date (strings, for DataFrame construction), and the sale amount.
case class shopRecord (sid:String, year:String,month:String,day:String,money:Int){}

View File

@@ -0,0 +1,152 @@
package com.aisi.sparkSql;
import java.io.{File, PrintWriter}
import java.text.SimpleDateFormat
import java.util.{Date, Properties, Random}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
object MockData {

  /** Returns a string of `index` random decimal digits (0-9). */
  def randomNum(index: Int, random: Random): String = {
    val sb = new StringBuilder
    for (_ <- 0 until index) {
      sb.append(random.nextInt(10))
    }
    sb.toString
  }

  /**
   * Draws a random value in [0, num) and left-pads it with zeros to `index` digits.
   * Padding is applied unconditionally via "%0Nd" (the old `< 10` guard
   * under-padded two-digit values whenever index > 2); output is identical for
   * all existing call sites.
   */
  def fillZero(random: Random, num: Int, index: Int): String =
    ("%0" + index + "d").format(random.nextInt(num))

  /** Opens `path` for writing, truncating any existing file. */
  def initFile(path: String): PrintWriter =
    new PrintWriter(new File(path))

  /** Writes one line and flushes immediately so output survives an abrupt stop. */
  def writeDataToFile(pw: PrintWriter, content: String): Unit = {
    pw.write(content + "\n")
    pw.flush()
  }

  def closeFile(pw: PrintWriter): Unit = pw.close()

  /** Builds a String/String producer against a local broker with full-ack durability. */
  def initKafkaProducer(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("acks", "all")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    new KafkaProducer[String, String](props)
  }

  /** Sends one record to the RoadRealTimeLog topic (fire-and-forget, async send). */
  def writeDataToKafka(producer: KafkaProducer[String, String], content: String): Unit =
    producer.send(new ProducerRecord[String, String]("RoadRealTimeLog", content))

  def closeKafka(producer: KafkaProducer[String, String]): Unit = producer.close()

  /**
   * Generates mock road-monitoring records for 3000 random cars and writes each
   * record both to a local file and to Kafka, tab-separated:
   * date, monitorId, cameraId, plate, actionTime, speed, roadId, areaId.
   */
  def mock(): Unit = {
    // NOTE(review): "路径" is a placeholder ("path") — a file literally named
    // 路径 is created in the working directory; replace with a real path.
    val pw = initFile("路径")
    val producer = initKafkaProducer()
    val random = new Random()
    // Province prefixes for mock licence plates ("京" appears twice, weighting it slightly).
    val locations = Array("鲁", "京", "豫", "津", "沪", "赣", "津", "深", "黑", "粤")
    val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date())
    for (i <- 0 until 3000) {
      // Plate = province prefix + random uppercase letter + 5 random digits.
      val car = locations(random.nextInt(10)) + (65 + random.nextInt(26)).asInstanceOf[Char] + randomNum(5, random)
      // BUG FIX: the separator was "" instead of " ", so baseActionTime was
      // e.g. "2024-10-2708" and split(" ")(1) below threw
      // ArrayIndexOutOfBoundsException on the first hour roll-over.
      var baseActionTime = day + " " + fillZero(random, 24, 2)
      for (j <- 0 until random.nextInt(300)) {
        // && replaces the non-short-circuit & of the original (same result for
        // Booleans, but idiomatic and short-circuiting).
        if (j % 30 == 0 && j != 0) {
          // Advance the hour, keeping the two-digit zero-padded form.
          var nextHour = ""
          val baseHour = baseActionTime.split(" ")(1)
          if (baseHour.startsWith("0")) {
            if (baseHour.endsWith("9")) {
              nextHour = "10"
            } else {
              nextHour = "0" + (baseHour.substring(1).toInt + 1).toString
            }
          } else if (baseHour == "23") {
            // Past the last hour: restart at a fresh random hour on the same date.
            nextHour = fillZero(random, 24, 2)
          } else {
            nextHour = (baseHour.toInt + 1).toString
          }
          baseActionTime = day + " " + nextHour
        }
        val actionTime = baseActionTime + ":" + fillZero(random, 60, 2) + ":" + fillZero(random, 60, 2)
        val monitorId = fillZero(random, 10, 4)
        val speed = random.nextInt(200) + 1
        val roadId = random.nextInt(50) + 1
        val cameraId = "0" + randomNum(4, random)
        val areaId = fillZero(random, random.nextInt(8) + 1, 2)
        val content = day + "\t" + monitorId + "\t" + cameraId + "\t" + car + "\t" + actionTime + "\t" + speed + "\t" + roadId + "\t" + areaId
        writeDataToFile(pw, content)
        writeDataToKafka(producer, content)
        Thread.sleep(50)
      }
    }
    closeFile(pw)
    closeKafka(producer)
  }

  def main(args: Array[String]): Unit = mock()
}

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.hive.jdbc.HiveDriver

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SQLContext}

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Window

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Window

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.sql.expressions.Aggregator

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

View File

@@ -1,4 +1,4 @@
package com.aisi.spark
package com.aisi.sparkSql
import org.apache.spark.sql.SparkSession

View File

@@ -0,0 +1,21 @@
package com.aisi.sparkSreaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object TestStreaming {

  /**
   * Five-second micro-batch word count over a local socket stream: reads
   * lines from localhost:6666, splits on spaces and prints per-batch counts.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
    sparkConf.setMaster("local[*]")
    sparkConf.setAppName("testStreaming")
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))

    val lines = streamingContext.socketTextStream("localhost", 6666)
    val wordCounts = lines
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey((a, b) => a + b)
    wordCounts.print()

    // Start the streaming job and block until it is stopped externally.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}