commit
This commit is contained in:
4
Spark/data/a.txt
Normal file
4
Spark/data/a.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
1 zhangsan 20 male
|
||||
2 lisi 30 female
|
||||
3 wangwu 35 male
|
||||
4 zhaosi 40 female
|
||||
3883
Spark/data/movies.txt
Normal file
3883
Spark/data/movies.txt
Normal file
File diff suppressed because it is too large
Load Diff
1000209
Spark/data/ratings.txt
Normal file
1000209
Spark/data/ratings.txt
Normal file
File diff suppressed because it is too large
Load Diff
13
Spark/data/word.txt
Normal file
13
Spark/data/word.txt
Normal file
@@ -0,0 +1,13 @@
|
||||
shenjianZ poop yuqing yuqin
|
||||
yuqing yuqin shenjianZ poop
|
||||
shenjianZ poop yuqing yuqin
|
||||
yuqing yuqin shenjianZ poop
|
||||
shenjianZ poop yuqing yuqin
|
||||
yuqing yuqin shenjianZ poop
|
||||
shenjianZ poop yuqing yuqin
|
||||
yuqing yuqin shenjianZ poop
|
||||
shenjianZ poop yuqing yuqin
|
||||
yuqing yuqin shenjianZ poop
|
||||
shenjianZ poop yuqing yuqin
|
||||
yuqing yuqin shenjianZ poop
|
||||
|
||||
41
Spark/dependency-reduced-pom.xml
Normal file
41
Spark/dependency-reduced-pom.xml
Normal file
@@ -0,0 +1,41 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<parent>
|
||||
<artifactId>HaiNiuProjects</artifactId>
|
||||
<groupId>com.aisi</groupId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>Spark</artifactId>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<version>3.2.4</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:commons-beanutils-core</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/services/**</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
</properties>
|
||||
</project>
|
||||
98
Spark/pom.xml
Normal file
98
Spark/pom.xml
Normal file
@@ -0,0 +1,98 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>com.aisi</groupId>
|
||||
<artifactId>HaiNiuProjects</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>Spark</artifactId>
|
||||
|
||||
<properties>
|
||||
<maven.compiler.source>8</maven.compiler.source>
|
||||
<maven.compiler.target>8</maven.compiler.target>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.12</artifactId>
|
||||
<version>3.1.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
<version>2.7.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
<version>1.7.30</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
<version>1.2.17</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.12</artifactId> <!-- 根据你的 Scala 版本选择合适的版本 -->
|
||||
<version>3.1.2</version> <!-- 根据你的 Spark 版本选择合适的版本 -->
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hive</groupId>
|
||||
<artifactId>hive-jdbc</artifactId>
|
||||
<version>3.1.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<!-- Scala Maven Plugin -->
|
||||
<!-- <plugin>-->
|
||||
<!-- <groupId>net.alchim31.maven</groupId>-->
|
||||
<!-- <artifactId>scala-maven-plugin</artifactId>-->
|
||||
<!-- <version>4.5.3</version>-->
|
||||
<!-- <executions>-->
|
||||
<!-- <execution>-->
|
||||
<!-- <goals>-->
|
||||
<!-- <goal>compile</goal>-->
|
||||
<!-- <goal>testCompile</goal>-->
|
||||
<!-- </goals>-->
|
||||
<!-- </execution>-->
|
||||
<!-- </executions>-->
|
||||
<!-- </plugin>-->
|
||||
|
||||
<!-- Maven Shade Plugin -->
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-shade-plugin</artifactId>
|
||||
<version>3.2.4</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>shade</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<filters>
|
||||
<filter>
|
||||
<artifact>*:commons-beanutils-core</artifact>
|
||||
<excludes>
|
||||
<exclude>META-INF/services/**</exclude>
|
||||
</excludes>
|
||||
</filter>
|
||||
</filters>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
17
Spark/src/main/java/com/aisi/Main.java
Normal file
17
Spark/src/main/java/com/aisi/Main.java
Normal file
@@ -0,0 +1,17 @@
|
||||
package com.aisi;

//TIP To <b>run</b> the code, press <shortcut actionId="Run"/> or
// click the <icon src="AllIcons.Actions.Execute"/> icon in the gutter.
public class Main {
    public static void main(String[] args) {
        //TIP When the caret is on highlighted text, press <shortcut actionId="ShowIntentionActions"/>
        // to see how IntelliJ IDEA suggests fixing it.
        System.out.printf("Hello and welcome!");

        int i = 1;
        while (i <= 5) {
            //TIP Press <shortcut actionId="Debug"/> to start debugging. We have already set
            // a <icon src="AllIcons.Debugger.Db_set_breakpoint"/> breakpoint, and you can
            // always add more with <shortcut actionId="ToggleLineBreakpoint"/>.
            System.out.println("i = " + i);
            i++;
        }
    }
}
|
||||
27
Spark/src/main/java/com/aisi/spark/WordCount.scala
Normal file
27
Spark/src/main/java/com/aisi/spark/WordCount.scala
Normal file
@@ -0,0 +1,27 @@
|
||||
package com.aisi.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * Local word-count job: reads a whitespace-separated text file and writes
 * per-word counts to a local output directory.
 */
object WordCount {
  def main(args: Array[String]): Unit = {
    val config = new SparkConf()
    config.setMaster("local")
    config.setAppName("SH_wordcount")

    // Disable HDFS permission checks (the job reads/writes local paths).
    config.set("spark.hadoop.dfs.permissions", "false")

    val sc = new SparkContext(config)
    try {
      // Windows-style input path.
      val rdd = sc.textFile("D:/JetBrainsToolProject/IntelJ IDEA/HaiNiuProjects/Spark/data/word.txt")

      // reduceByKey sums counts map-side before the shuffle; the original
      // groupBy(_._1).mapValues(_.size) shipped every (word, 1) pair across
      // the network only to count them afterwards.
      val counts = rdd
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)

      // Save the result to a local output directory.
      counts.saveAsTextFile("D:/JetBrainsToolProject/IntelJ IDEA/HaiNiuProjects/Spark/data/res")
    } finally {
      sc.stop() // release the SparkContext even if the job fails (was never stopped before)
    }
  }
}
|
||||
20
Spark/src/main/java/com/aisi/spark/WordCountForCluster.scala
Normal file
20
Spark/src/main/java/com/aisi/spark/WordCountForCluster.scala
Normal file
@@ -0,0 +1,20 @@
|
||||
package com.aisi.spark

import org.apache.spark.SparkContext

/**
 * Cluster word-count job. Master/app settings come from spark-submit;
 * args(0) is the input path and args(1) the output path.
 */
object WordCountForCluster {
  def main(args: Array[String]): Unit = {
    // Fail with a clear usage message instead of an opaque MatchError
    // when the argument count is wrong.
    require(args.length >= 2, "usage: WordCountForCluster <input> <output>")
    val Array(input, output) = args.take(2)

    val sc = new SparkContext()
    try {
      val rdd = sc.textFile(input)

      // reduceByKey combines counts map-side, avoiding the full value
      // shuffle that groupBy(_._1).mapValues(_.size) performed.
      val counts = rdd
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)

      // Save the result to the requested output path.
      counts.saveAsTextFile(output)
    } finally {
      sc.stop() // release the SparkContext even on failure
    }
  }
}
|
||||
6
Spark/src/main/resources/log4j.properties
Normal file
6
Spark/src/main/resources/log4j.properties
Normal file
@@ -0,0 +1,6 @@
|
||||
log4j.rootLogger=info,console
|
||||
|
||||
log4j.appender.console=org.apache.log4j.ConsoleAppender
|
||||
log4j.appender.console.target=System.out
|
||||
log4j.appender.console.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c %M(): %m%n
|
||||
19
Spark/src/main/scala/com/aisi/spark/TestBeeline.scala
Normal file
19
Spark/src/main/scala/com/aisi/spark/TestBeeline.scala
Normal file
@@ -0,0 +1,19 @@
|
||||
package com.aisi.spark

import org.apache.hive.jdbc.HiveDriver

import java.sql.DriverManager

/**
 * Connects to HiveServer2 over JDBC and prints the row count of table `stu`.
 */
object TestBeeline {
  def main(args: Array[String]): Unit = {
    // Referencing the class forces the Hive JDBC driver to load and
    // register itself with DriverManager.
    classOf[HiveDriver]
    val connection = DriverManager.getConnection("jdbc:hive2://nn1:20000", "hadoop", null)
    try {
      val statement = connection.prepareStatement("SELECT count(1) as cnt from stu;")
      try {
        val resultSet = statement.executeQuery()
        try {
          while (resultSet.next()) {
            val cnt = resultSet.getLong("cnt")
            println("stu表的总条数为:" + cnt)
          }
        } finally resultSet.close() // was never closed before
      } finally statement.close()   // was never closed before
    } finally {
      // Original only closed the connection on the happy path; a query
      // failure leaked it.
      connection.close()
    }
  }
}
|
||||
75
Spark/src/main/scala/com/aisi/spark/TestMovieWithSql.scala
Normal file
75
Spark/src/main/scala/com/aisi/spark/TestMovieWithSql.scala
Normal file
@@ -0,0 +1,75 @@
|
||||
package com.aisi.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * For every user, finds the movie genre that user has rated most often.
 *
 * Inputs:
 *  - Spark/data/movies.txt : movieId,movieName(,commas allowed in name),genre1|genre2|...
 *  - Spark/data/ratings.txt: userId,movieId,score
 */
object TestMovieWithSql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("movie")
    conf.setMaster("local[*]")
    // BUG FIX: the SQL shuffle-partition setting is "spark.sql.shuffle.partitions".
    // The original key "spark.shuffle.partitions" is not a Spark config option,
    // so the intended value of 20 was silently ignored.
    conf.set("spark.sql.shuffle.partitions", "20")
    val sc = new SparkContext(conf)
    val sqlSc = new SQLContext(sc)

    import sqlSc.implicits._
    // Explode each movie into one row per genre.
    val df: DataFrame = sc.textFile("Spark/data/movies.txt")
      .flatMap(t => {
        val line = t.split(",")
        val movieId = line(0)
        // Last field is the "|"-separated genre list.
        val movieTypes = line.reverse.head
        // Everything between the first and last field is the title
        // (titles may themselves contain commas).
        val movieName = line.tail.reverse.tail.reverse.mkString(" ")
        movieTypes.split("\\|").map(movieType => (movieId, movieName, movieType)) // one triple per genre
      }).toDF("movieId", "movieName", "movieType")
    // df.limit(10).show()
    val df1 = sc.textFile("Spark/data/ratings.txt")
      .map(t => {
        val line = t.split(",")
        val userId = line(0)
        val movieId = line(1)
        val score = line(2).toDouble
        (userId, movieId, score)
      }).toDF("userId", "movieId", "score")
    df1.limit(10).show()

    import org.apache.spark.sql.functions._
    // Count ratings per (user, genre), then keep each user's top genre:
    //  - row_number() assigns a unique rank within each window partition;
    //  - Window.partitionBy("userId") ranks independently per user;
    //  - orderBy($"cnt".desc) puts the most-rated genre first, so rn = 1
    //    selects it.
    df.join(df1, "movieId").groupBy("userId", "movieType")
      .agg(count("userId").as("cnt"))
      .withColumn("rn", row_number().over(Window.partitionBy("userId").orderBy($"cnt".desc)))
      .where("rn = 1") // first row of each userId partition = the genre with the largest cnt
      .show()

    // Example output columns: userId | movieType | cnt | rn (rn always 1),
    // e.g. | 1090 | Drama | 47 | 1 |

    sc.stop() // release the local Spark context (was never stopped before)
  }
}
// case class movieRecord(var movieId: String, var movieName: String, var movieTypes: String)
|
||||
45
Spark/src/main/scala/com/aisi/spark/TestSparkSql.scala
Normal file
45
Spark/src/main/scala/com/aisi/spark/TestSparkSql.scala
Normal file
@@ -0,0 +1,45 @@
|
||||
package com.aisi.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._ // agg/count/row_number
import org.apache.spark.sql.expressions.Window // window specs

/**
 * Basic Spark SQL demo: loads a small whitespace-delimited person file and
 * prints the oldest person of each gender.
 */
object TestSparkSql {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
    sparkConf.setMaster("local[*]")
    sparkConf.setAppName("test sql")
    val context = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(context)
    // Bring the RDD-to-DataFrame implicit conversions into scope.
    import sqlContext.implicits._

    // Each input line has the shape "<id> <name> <age> <gender>".
    val people: RDD[(Int, String, Int, String)] =
      context.textFile("Spark/data/a.txt").map { row =>
        val fields = row.split(" ")
        (fields(0).toInt, fields(1), fields(2).toInt, fields(3))
      }

    val personDf: DataFrame = people.toDF("id", "name", "age", "gender")
    personDf.show()        // print the table contents
    personDf.printSchema() // print the column names and types

    // Rank rows within each gender by age (oldest first) and keep the top row.
    val byGenderOldestFirst = Window.partitionBy("gender").orderBy($"age".desc)
    personDf
      .withColumn("rn", row_number().over(byGenderOldestFirst))
      .where("rn = 1")
      .show()
  }
}
|
||||
Reference in New Issue
Block a user