add spark-sql case
This commit is contained in:
parent
8b4a30b940
commit
4726888819
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,15 @@
1,Toy Story (1995),Animation
1,Toy Story (1995),Children's
1,Toy Story (1995),Comedy
2,Jumanji (1995),Adventure
2,Jumanji (1995),Children's
2,Jumanji (1995),Fantasy
3,Grumpier Old Men (1995),Comedy
3,Grumpier Old Men (1995),Romance
4,Waiting to Exhale (1995),Comedy
4,Waiting to Exhale (1995),Drama
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action
6,Heat (1995),Crime
6,Heat (1995),Thriller
7,Sabrina (1995),Comedy
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,2 @@
{"age":30,"name":"John"}
{"age":25,"name":"Doe"}
@@ -0,0 +1,3 @@
1 zhangsan 20000 10000
2 lisi 21000 20000
3 wangwu 22000 21000
@@ -0,0 +1,4 @@
[
{"name": "John", "age": 30},
{"name": "Doe", "age": 25}
]
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,13 @@
shenjianZ poop yuqing yuqin
yuqing yuqin shenjianZ poop
shenjianZ poop yuqing yuqin
yuqing yuqin shenjianZ poop
shenjianZ poop yuqing yuqin
yuqing yuqin shenjianZ poop
shenjianZ poop yuqing yuqin
yuqing yuqin shenjianZ poop
shenjianZ poop yuqing yuqin
yuqing yuqin shenjianZ poop
shenjianZ poop yuqing yuqin
yuqing yuqin shenjianZ poop
@@ -33,6 +33,11 @@
     <artifactId>slf4j-log4j12</artifactId>
     <version>1.7.30</version>
 </dependency>
+<dependency>
+    <groupId>mysql</groupId>
+    <artifactId>mysql-connector-java</artifactId>
+    <version>8.0.33</version> <!-- make sure the version is compatible with the database -->
+</dependency>
 <dependency>
     <groupId>log4j</groupId>
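The mysql-connector-java dependency added above is what the (currently commented-out) JDBC write in TestMovieWithWriteApi relies on. As a complement, a minimal sketch of reading that table back through Spark's JDBC reader; it assumes the same local URL, credentials, table name, and sqlSc handle used elsewhere in this commit, none of which are verified here:

// sketch only: read the "movie" table back via JDBC (same connection settings as the write example)
import java.util.Properties
val props = new Properties()
props.put("user", "root")       // assumption: same credentials as the commented write example
props.put("password", "root")
props.put("driver", "com.mysql.cj.jdbc.Driver")
val movieFromDb = sqlSc.read.jdbc("jdbc:mysql://localhost:3306/test", "movie", props)
movieFromDb.show()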
@@ -1,11 +1,10 @@
 package com.aisi.spark

-import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.{DataFrame, SQLContext}
 import org.apache.spark.{SparkConf, SparkContext}

-object TestMovieWithSql {
+object TestMovieWithPureSql {
   def main(args: Array[String]): Unit = {
     val conf = new SparkConf()
     conf.setAppName("movie")
@@ -35,15 +34,33 @@ object TestMovieWithSql {
     }).toDF("userId", "movieId", "score")
     df1.limit(10).show()
     import org.apache.spark.sql.functions._
-    df.join(df1,"movieId").groupBy("userId","movieType")
-      .agg(count("userId").as("cnt"))
-      .withColumn("rn",row_number().over(Window.partitionBy("userId").orderBy($"cnt".desc)))
-      .where("rn = 1") // take the first row of each userId partition, i.e. the one with the largest cnt
-      .show()
-    //
-    // row_number(): a window function that assigns each row in a partition a unique row number, increasing according to the ordering rule.
-    // Window.partitionBy("userId"): partitions the data by userId, so row numbers are computed independently for each userId; each user's rows form their own partition.
-    // orderBy($"cnt".desc): within each userId partition, rows are sorted by cnt (how often this user watched that movie type) in descending order; row numbers follow that ordering.
+    df.createTempView("movie")    // ("movieId", "movieName", "movieType")
+    df1.createTempView("ratings") // ("userId", "movieId", "score")
+    // cnt is how many times the most frequent movieType occurs among the movies this user watched
+    sqlSc.sql(
+      """
+        |SELECT userId, movieType, cnt, rn
+        |FROM (
+        |  SELECT *,
+        |         row_number() OVER (PARTITION BY userId ORDER BY cnt DESC) AS rn
+        |  FROM (
+        |    SELECT COUNT(1) AS cnt,
+        |           userId,
+        |           movieType
+        |    FROM (
+        |      SELECT userId,
+        |             movieType
+        |      FROM movie m
+        |      JOIN ratings r ON m.movieId = r.movieId
+        |    ) t
+        |    GROUP BY userId, movieType
+        |  ) t1
+        |) t2
+        |WHERE rn = 1
+        |""".stripMargin
+    ).show(20)

     // +------+---------+---+---+
     // |userId|movieType|cnt| rn|
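One note on the window function described in the removed comments: row_number() keeps a single, arbitrarily chosen row when two genres tie on cnt for the same user. A minimal variation of the new query, keeping the same movie and ratings views, swaps in rank() so tied genres are all returned:

// sketch: rank() keeps every genre that shares the top cnt for a user
sqlSc.sql(
  """
    |SELECT userId, movieType, cnt
    |FROM (
    |  SELECT userId, movieType, cnt,
    |         rank() OVER (PARTITION BY userId ORDER BY cnt DESC) AS rn
    |  FROM (
    |    SELECT userId, movieType, COUNT(1) AS cnt
    |    FROM movie m JOIN ratings r ON m.movieId = r.movieId
    |    GROUP BY userId, movieType
    |  ) t
    |) t1
    |WHERE rn = 1
    |""".stripMargin
).show()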
@@ -0,0 +1,58 @@
package com.aisi.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Which movie genre does each user like most?
 * What are the top three most popular movies in each genre?
 * Then recommend them to the user
 */
object TestMovieWithSqlApi {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("movie")
    conf.setMaster("local[*]")
    // conf.set("spark.shuffle.partitions", "20")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)

    import sqlSc.implicits._
    val df: DataFrame = sc.textFile("Spark/data/movies.txt")
      .flatMap(t => {
        val line = t.split(",")
        val movieId = line(0)
        val movieTypes = line.reverse.head
        val movieName = line.tail.reverse.tail.reverse.mkString(" ")
        // movieTypes.split("\\|").map(movieRecord(movieId, movieName, _))
        movieTypes.split("\\|").map(movieType => (movieId, movieName, movieType)) // return a (movieId, movieName, movieType) triple
      }).toDF("movieId", "movieName", "movieType")
    // df.limit(10).show()
    val df1 = sc.textFile("Spark/data/ratings.txt")
      .map(t => {
        val line = t.split(",")
        val userId = line(0)
        val movieId = line(1)
        val score = line(2).toDouble
        (userId, movieId, score)
      }).toDF("userId", "movieId", "score")
    df1.limit(10).show()
    import org.apache.spark.sql.functions._
    val df11 = df.join(df1, "movieId").groupBy("userId", "movieType")
      .agg(count("userId").as("cnt"))
      .withColumn("rn", row_number().over(Window.partitionBy("userId").orderBy($"cnt".desc)))
      .where("rn = 1") // take the first row of each userId partition, i.e. the one with the largest cnt
      .select("userId", "movieType")
    val df22 = df.join(df1, "movieId").groupBy("movieType", "movieName")
      .agg(avg("score").as("avg"))
      .withColumn("rn", row_number().over(Window.partitionBy("movieType").orderBy($"avg".desc)))
      .where("rn < 4")
      .select("movieType", "movieName")
    df11.join(df22, "movieType").show()

  }

}
//case class movieRecord(var movieId:String,var movieName:String, var movieTypes:String)
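Since df.join(df1, "movieId") is built twice above (once for df11 and once for df22), one optional refinement is to materialize the join once and derive both aggregations from it. A small sketch reusing the names from this file; favoriteType and topPerType are illustrative names, not part of the commit:

// sketch: compute the movie/ratings join once and cache it before the two aggregations
val joined = df.join(df1, "movieId").cache()
val favoriteType = joined.groupBy("userId", "movieType").agg(count("userId").as("cnt"))
val topPerType   = joined.groupBy("movieType", "movieName").agg(avg("score").as("avg"))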
@@ -0,0 +1,76 @@
package com.aisi.spark

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import java.util.Properties

/**
 * Which movie genre does each user like most?
 * What are the top three most popular movies in each genre?
 * Then recommend them to the user
 */
object TestMovieWithWriteApi {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("movie")
    conf.setMaster("local[*]")
    // configuration to disable the local file permission check
    conf.set("spark.hadoop.fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem")

    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)

    import sqlSc.implicits._

    // read the data and convert it into a DataFrame
    val df: DataFrame = sc.textFile("Spark/data/movies.txt")
      .flatMap(t => {
        val line = t.split(",")
        val movieId = line(0)
        val movieTypes = line.reverse.head
        val movieName = line.tail.reverse.tail.reverse.mkString(" ")
        // split the movie types into one record per type
        movieTypes.split("\\|").map(movieType => (movieId, movieName, movieType)) // return a triple
      }).toDF("movieId", "movieName", "movieType").limit(15)

    // print the DataFrame contents
    df.show()
    //
    // (1) write to MySQL via JDBC
    // // set the database connection properties
    // val properties = new Properties()
    // properties.put("user", "root")
    // properties.put("password", "root")
    // properties.put("driver", "com.mysql.cj.jdbc.Driver") // specify the MySQL JDBC driver class
    // import org.apache.spark.sql.functions._
    // // add an id column, but omit it when writing to the database
    // val dfWithId = df.withColumn("id", monotonically_increasing_id())
    // // select the columns to write to MySQL, leaving out the id column
    // val finalDf = dfWithId.select("movieId", "movieName", "movieType")
    // // write to the MySQL database
    // finalDf.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test", "movie", properties)

    // (2) write as CSV
    // df.write.csv("Spark/data/csv")
    // df.write.csv("data/csv")
    // df.write.format("org.apache.spark.sql.execution.datasources.v2.csv.CSVDataSourceV2")
    //   .save("Spark/data/csv")

    // (3) write as parquet
    // df.write.parquet("Spark/data/parquet")
    // df.write.format("org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2")
    //   .save("Spark/data/parquet")


    // (4) write as JSON
    // val df1 = sqlSc.read.json("Spark/data/stu.json")
    // df1.select("_corrupt_record").show(false)
    // df1.write.mode("overwrite").format("json").save("Spark/data/json")


    // (5) write as plain text
    // val df1 = sqlSc.read.text("Spark/data/word.txt")
    // df1.write.format("org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2")
    //   .save("Spark/data/text")
  }
}
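A note on step (4): Spark/data/stu.json (added in this commit) is a pretty-printed JSON array, so Spark's default line-delimited JSON reader routes the records into _corrupt_record, which is presumably why that column is inspected above. A minimal sketch, using the same sqlSc handle, of reading the file with the multiLine option instead:

// sketch: read a multi-line JSON array properly instead of inspecting _corrupt_record
val students = sqlSc.read
  .option("multiLine", value = true) // allow a record (or an array of records) to span multiple lines
  .json("Spark/data/stu.json")
students.show()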
@@ -0,0 +1,45 @@
package com.aisi.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext} // import Window

object TestSparkPureSql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("test sql")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)
    // bring the SQLContext implicits into scope
    import sqlSc.implicits._
    val rdd: RDD[(Int, String, Int, String)] = sc.textFile("Spark/data/a.txt")
      .map(t => {
        val line = t.split(" ")
        val id = line(0).toInt
        val name = line(1)
        val age = line(2).toInt
        val gender = line(3)
        (id, name, age, gender)
      })
    val df: DataFrame = rdd.toDF("id", "name", "age", "gender")
    df.show()        // show the table data
    df.printSchema() // show the table schema
    // df.where("age > 20").groupBy("gender").sum("age").show()
    // df.orderBy($"age".desc).show()
    // aggregate and sort
    // val result = df.groupBy("gender")
    //   .agg(
    //     count("id").as("count_id"), // count ids
    //     sum("age").as("sum_age")    // sum ages
    //   )
    //   .orderBy($"sum_age".desc) // sort by sum_age in descending order
    // result.show()

    df.createTempView("stu")
    val df1 = sqlSc.sql("select count(1) as gender_count,gender from stu group by gender")
    df1.show()
  }
}
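For symmetry with the commented-out DataFrame aggregation above, the same count/sum/order-by can be expressed against the stu view in pure SQL; a small sketch using the sqlSc handle already in scope:

// sketch: pure-SQL version of the commented-out groupBy/agg/orderBy block
sqlSc.sql(
  """
    |SELECT gender,
    |       count(id) AS count_id,
    |       sum(age)  AS sum_age
    |FROM stu
    |GROUP BY gender
    |ORDER BY sum_age DESC
    |""".stripMargin
).show()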
@@ -0,0 +1,42 @@
package com.aisi.spark


import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{DataFrame, Encoder, Encoders, SparkSession, functions}

/**
 * How to define and use a UDAF
 * Compute the sum of all students' ages
 */
object TestUDAF {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("testUDAF").master("local[*]").getOrCreate()
    import sparkSession.implicits._
    val df: DataFrame = sparkSession.sparkContext.textFile("Spark/data/a.txt")
      .map(t => {
        val line = t.split(" ")
        (line(0), line(1), line(2).toInt, line(3))
      }).toDF("id", "name", "age", "gender")
    import org.apache.spark.sql.functions._
    // the conventional approach
    // val df1 = df.agg(avg("age")).show()
    // val df2 = df.groupBy("gender").avg("age").show()
    val mySum = new MySum()
    sparkSession.udf.register("mysum", functions.udaf(mySum))
    df.createTempView("stu")
    sparkSession.sql("select mysum(age) from stu").show()
  }
}
class MySum extends Aggregator[Int, Int, Int] {
  override def zero: Int = 0

  override def reduce(b: Int, a: Int): Int = b + a

  override def merge(b1: Int, b2: Int): Int = b1 + b2

  override def finish(reduction: Int): Int = reduction

  override def bufferEncoder: Encoder[Int] = Encoders.scalaInt

  override def outputEncoder: Encoder[Int] = Encoders.scalaInt
}
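The same MySum aggregator can also be used without registering it as a SQL function, by applying it as a typed column over a Dataset[Int]. A minimal sketch, assuming the df and sparkSession defined above:

// sketch: apply the Aggregator directly as a typed column instead of the registered "mysum"
import sparkSession.implicits._
val ages = df.select($"age").as[Int]                     // Dataset[Int] holding just the age column
ages.select(new MySum().toColumn.name("sum_age")).show() // same result as the SQL route above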
@@ -0,0 +1,30 @@
package com.aisi.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * How to define and use a UDF
 * Compute each person's yearly income: salary plus bonus, i.e. salary * 12 + bonus
 */
object TestUDF {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("testUDF").master("local[*]").getOrCreate()
    import sparkSession.implicits._
    val df: DataFrame = sparkSession.sparkContext.textFile("Spark/data/salary.txt")
      .map(t => {
        val line = t.split(" ")
        (line(0), line(1), line(2).toInt, line(3).toInt)
      }).toDF("id", "name", "salary", "bonus")
    sparkSession.udf.register("all_income", (salary: Int, bonus: Int) => salary * 12 + bonus)
    df.createTempView("salary")
    // call the UDF from SQL
    sparkSession.sql("select id,name, all_income(salary,bonus) all from salary").show()
    import org.apache.spark.sql.functions
    // call the UDF through the DataFrame API
    df.withColumn("all", functions.callUDF("all_income", $"salary", $"bonus"))
      .select("id", "name", "all")
      .show()

  }
}
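Registration is only needed when the UDF is called by name (from SQL or callUDF). A minimal sketch of the third common route, keeping the same df and yearly-income formula: wrap the lambda with functions.udf and apply it directly as a column expression:

// sketch: use the UDF as a plain function value, no registration needed
import org.apache.spark.sql.functions.udf
val allIncome = udf((salary: Int, bonus: Int) => salary * 12 + bonus)
df.withColumn("all", allIncome($"salary", $"bonus"))
  .select("id", "name", "all")
  .show()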
@@ -0,0 +1,43 @@
package com.aisi.spark

import org.apache.spark.sql.SparkSession

/**
 * Using Hive tables and temporary views together in Spark does not conflict; you can use both in the same SparkSession.
 * Why there is no conflict:
 * Scope: a temporary view is only visible in the current Spark session, while a Hive table is global, so using a temporary view does not interfere with Hive tables.
 * Name collisions: if a temporary view and a Hive table share the same name, Spark prefers the temporary view.
 * When a table name appears in a SQL query, Spark looks up temporary views first and falls back to Hive tables, which helps avoid naming conflicts.
 */
object TestWithSparkSession {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .enableHiveSupport() // enable Hive support
      .master("local[*]")
      .appName("test")
      .getOrCreate()

    // create a sample DataFrame
    import sparkSession.implicits._
    val data = Seq(
      ("John", 30),
      ("Doe", 25)
    )
    val df = data.toDF("name", "age")

    // create a temporary view
    df.createOrReplaceTempView("people")

    // query the temporary view with Spark SQL
    val resultTempView = sparkSession.sql("SELECT * FROM people WHERE age > 28")
    resultTempView.show()

    // assume Hive has a table called "hive_people"
    // query the Hive table with Spark SQL
    val resultHive = sparkSession.sql("SELECT * FROM hive_people WHERE age > 28")
    resultHive.show()

    // stop the SparkSession
    sparkSession.stop()
  }
}
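To make the name-resolution rule from the header comment concrete: if a temporary view shared the name hive_people, Spark would resolve the bare name to the view. A minimal sketch, under the assumption that the Hive table lives in the default database:

// sketch: a temp view with the same name shadows the Hive table for unqualified lookups
df.createOrReplaceTempView("hive_people")
sparkSession.sql("SELECT * FROM hive_people").show()         // resolves to the temporary view
sparkSession.sql("SELECT * FROM default.hive_people").show() // a database-qualified name reaches the Hive table
sparkSession.catalog.dropTempView("hive_people")             // after dropping, the bare name resolves to the Hive table again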