The following data are the first-level comments under the apology post published by RNG's official Weibo account after the team was knocked out in the S8 quarterfinals.
Data description:
Data in the rng_comment.txt file
Field | Meaning |
index | record id |
child_comment | number of replies |
comment_time | comment time |
content | comment text |
da_v | Weibo personal verification |
like_status | number of likes |
pic | URL of an image comment |
user_id | Weibo user id |
user_name | Weibo user name |
vip_rank | Weibo VIP level |
stamp | timestamp |
1.1. Create an rng_comment topic in Kafka with 2 partitions and 2 replicas.
1.2. Preprocess the data and filter out empty lines.
1.3. Write the given file into Kafka, partitioned by record id: records with odd ids go to one partition, records with even ids to the other.
1.4. Connect Spark Streaming to Kafka.
1.5. After connecting Spark Streaming to Kafka, perform the following computations:
Create a database named rng_comment in MySQL (a table-creation sketch is shown further down, just before rua04).
In the rng_comment database, create a vip_rank table whose columns are all fields of the data.
In the rng_comment database, create a like_status table whose columns are all fields of the data.
In the rng_comment database, create a count_conmment table with two columns: time and count.
1.5.1. Select the users whose Weibo VIP level is 5 and write those records into the vip_rank table in MySQL.
1.5.2. Select the records with more than 10 likes and write them into the like_status table in MySQL.
1.5.3. Count the comments on each of the four days 2018/10/20, 2018/10/21, 2018/10/22 and 2018/10/23, and write the results into the count_conmment table in MySQL.
[success]Data file: rng_comment[/success]
Create the topic for 1.1 (run on a Kafka broker node):
bin/kafka-topics.sh --create --zookeeper node01:2181,node02:2181,node03:2181 --replication-factor 2 --partitions 2 --topic rng_comment
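If you prefer to create the topic from code instead of the shell, a minimal sketch with the Kafka AdminClient might look like this (assumptions: kafka-clients 2.x on the classpath, the same broker addresses as above, and the object name CreateRngCommentTopic is made up):

import java.util.{Collections, Properties}
import org.apache.kafka.clients.admin.{AdminClient, AdminClientConfig, NewTopic}

object CreateRngCommentTopic {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "node01:9092,node02:9092,node03:9092")
    val admin = AdminClient.create(props)
    // rng_comment: 2 partitions, replication factor 2
    val topic = new NewTopic("rng_comment", 2, 2.toShort)
    admin.createTopics(Collections.singletonList(topic)).all().get()
    admin.close()
  }
}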
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object rua01 {
//data preprocessing: filter out dirty records and empty lines
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("wula")
val sparkContext = new SparkContext(sparkConf)
sparkContext.setLogLevel("warn")
val dataRdd: RDD[String] = sparkContext.textFile("D:\\vedio\\2020\\4月\\04.15\\4.14号练习题\\rng_comment.txt", 1)
println(dataRdd.getNumPartitions)
//coalesce() compacts data by merging small partitions into bigger ones, reducing the number of partitions and improving efficiency
//coalesce()'s shuffle parameter defaults to false
//repartition() is simply coalesce() with shuffle = true
dataRdd.filter(x => {
var lineValid = true
val lineString: Array[String] = x.split("\t")
if (lineString.length != 11) lineValid = false
lineString.foreach(y => if (y.trim == "") lineValid = false)
lineValid
}
).coalesce(1).saveAsTextFile("processingData")
sparkContext.stop()
}
}
import java.util.Properties
import java.util.Date
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import java.util
import org.apache.spark.broadcast.Broadcast
object rua02 {
//write the given file into Kafka, partitioned by record id: odd ids go to one partition, even ids to the other
//producer that pushes the preprocessed data into Kafka
def main(args: Array[String]): Unit = {
// val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
// val sparkContext = new SparkContext(sparkConf)
// //create the StreamingContext
// val streamingContext = new StreamingContext(sparkConf, Seconds(5))
// //Kafka connection parameters
// val kafkaParams = Map[String, Object](
// "bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
// "key.deserializer" -> classOf[StringDeserializer],
// "value.deserializer" -> classOf[StringDeserializer],
// "group.id" -> "test",
// //earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
// //latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data
// //none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception
// //latest is configured here: resume from the committed offset if one exists, otherwise start from newly arriving data
// "auto.offset.reset" -> "latest",
// //false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
// "enable.auto.commit" -> (false: java.lang.Boolean)
// )
// var topics = Array("rng_comment")
// val line = null
// val partition = 0
//--------------------------------------------------- NO!!!!!!!!!!!!!!!!!!!!!!!!!!!
val conf = new SparkConf().setMaster("local[*]").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setAppName("SparkToKafka")
val sc = new SparkContext(conf)
val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
val kafkaProducerConfig = {
val p = new Properties()
p.setProperty("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
p.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
p.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
p
}
sc.broadcast(KafkaSink[String, String](kafkaProducerConfig))
}
val worldRDD = sc.textFile("processingData/part-00000")
worldRDD.foreach(x => {
//route by record id: even ids go to partition 0, odd ids to partition 1
val id = x.split("\t")(0).toInt
if (id % 2 == 0) {
kafkaProducer.value.send("rng_comment", 0, id.toString, x)
} else {
kafkaProducer.value.send("rng_comment", 1, id.toString, x)
}
}
)
//--------------------------------------------------- NO!!!!!!!!!!!!!!!!!!!!!!!!!!!
// val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
//
// val sparkContext = new SparkContext(sparkConf)
// val topic = "rng_comment"
// val brokers = "node01:9092,node02:9092,node03:9092"
// val now = new Date()
// val props = new Properties()
// props.put("bootstrap.servers", brokers)
// props.put("client.id", "0")
// props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
// props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
// props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
// props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
// val producer = new KafkaProducer[String, String](props)
// val fileRdd: RDD[String] = sparkContext.textFile("processingData/part-00000")
// fileRdd.foreach(x => {
// val rcd = new ProducerRecord[String, String](topic, x.toString)
// producer.send(rcd)
// })
// sparkContext.stop()
// //close the producer at the end, otherwise Kafka may never receive the buffered messages
// producer.close()
}
}
//serializable wrapper that lazily creates one KafkaProducer per executor, so it can be broadcast safely
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
lazy val producer = createProducer()
def send(topic: String, partition: Int, key: K, value: V): util.concurrent.Future[RecordMetadata] =
producer.send(new ProducerRecord[K, V](topic, partition, key, value))
def send(topic: String, key: K, value: V): util.concurrent.Future[RecordMetadata] =
producer.send(new ProducerRecord[K, V](topic, key, value))
def send(topic: String, value: V): util.concurrent.Future[RecordMetadata] =
producer.send(new ProducerRecord[K, V](topic, value))
}
object KafkaSink {
import scala.collection.JavaConversions._
def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
val createProducerFunc = () => {
val producer = new KafkaProducer[K, V](config)
sys.addShutdownHook {
producer.close()
}
producer
}
new KafkaSink(createProducerFunc)
}
def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
[info]
Custom partitioner (routes each record by the parity of its id field):
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import java.util.Map;
/**
* @version v 1.0
* @Author kami
* @Date 2020/4/23
*/
public class rua02_Java_KafkaPartition implements Partitioner {
//custom partitioner: even ids go to partition 0, odd ids to partition 1
@Override
public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
String id = value.toString().split("\t")[0];
int idInt = Integer.parseInt(id);
if (idInt % 2 == 0) {
return 0;
} else {
return 1;
}
}
@Override
public void close() {
}
@Override
public void configure(Map<String, ?> map) {
}
}
[/info]
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Properties;
/**
* @version v 1.0
* @Author kami
* @Date 2020/4/15
*/
public class rua02_Java {
public static void main(String[] args) throws IOException {
Properties props = new Properties();
props.put("bootstrap.servers", "node01:9092,node02:9092,node03:9092");
props.put("acks", "-1");
props.put("retries", 0);
props.put("batch.size", 16384);
props.put("linger.ms", 1);
props.put("buffer.memory", 33554432);
props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.put("partitioner.class", "com.kmai.demo02.rua02_Java_KafkaPartition");
Producer<String, String> producer = new KafkaProducer<>(props);
File inputFile = new File("processingData/part-00000");
BufferedReader bufferedReader = new BufferedReader(new FileReader(inputFile));
String line = null;
// int partition = 0;
// while ((line = bufferedReader.readLine()) != null) {
// try {
// if (Integer.parseInt(line.split("\t")[0]) % 2 == 0) {
// partition = 0;
// } else {
// partition = 1;
// }
// } catch (NumberFormatException e) {
// continue;
// }
// producer.send(new ProducerRecord<String, String>("rng_comment", partition, String.valueOf(partition), line));
// }
while ((line = bufferedReader.readLine()) != null) {
producer.send(new ProducerRecord<String, String>("rng_comment", line));
}
bufferedReader.close();
producer.close();
}
}
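To check that the odd/even routing actually worked, a plain consumer can print the partition each record landed in. A rough sketch (assumptions: the same brokers as above, the Kafka 2.x consumer API, and the object name rua02_Check is made up):

import java.time.Duration
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import scala.collection.JavaConverters._

object rua02_Check {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    props.put("group.id", "partition-check")
    props.put("auto.offset.reset", "earliest")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("rng_comment"))
    // poll a few times and print id -> partition; even ids should show partition 0, odd ids partition 1
    for (_ <- 1 to 10) {
      val records = consumer.poll(Duration.ofSeconds(1))
      records.asScala.foreach { r =>
        val id = r.value().split("\t")(0)
        println(s"id=$id partition=${r.partition()}")
      }
    }
    consumer.close()
  }
}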
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object rua03 {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
sc.setLogLevel("warn")
//create the StreamingContext
val ssc = new StreamingContext(sc, Seconds(5))
//Kafka connection parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "test",
//earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
//latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data
//none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception
//latest is configured here: resume from the committed offset if one exists, otherwise start from newly arriving data
"auto.offset.reset" -> "latest",
//false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("rng_comment")
val kafkaDatas: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
//extract the message value from each Kafka record and process it
val kafkaValues: DStream[String] = kafkaDatas.map(message => message.value())
kafkaValues.print()
//start the streaming job
ssc.start()
//block until termination
ssc.awaitTermination()
}
}
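The jobs below assume that the rng_comment database and the three tables from requirement 1.5 already exist in MySQL. A minimal table-creation sketch run once over JDBC (the column names and VARCHAR types are my assumptions, not given in the exercise; any 11-column schema works for the positional inserts, but the like_status column names have to match the RNG_Comment case class used by the DataFrame append in rua05_Demo, hence vipRank below):

import java.sql.DriverManager

object CreateTables {
  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver")
    // connect without a database name so the database itself can be created first
    val conn = DriverManager.getConnection("jdbc:mysql://node01:3306/?characterEncoding=UTF-8", "root", "123456")
    val stmt = conn.createStatement()
    stmt.execute("CREATE DATABASE IF NOT EXISTS rng_comment")
    // `index` is a reserved word in MySQL, so it has to be backtick-quoted
    val allFields = "`index` VARCHAR(32), child_comment VARCHAR(32), comment_time VARCHAR(64), content TEXT, " +
      "da_v VARCHAR(32), like_status VARCHAR(32), pic TEXT, user_id VARCHAR(64), user_name VARCHAR(255), " +
      "vipRank VARCHAR(32), stamp VARCHAR(64)"
    stmt.execute(s"CREATE TABLE IF NOT EXISTS rng_comment.vip_rank ($allFields)")
    stmt.execute(s"CREATE TABLE IF NOT EXISTS rng_comment.like_status ($allFields)")
    // count_conmment: the comment date plus the number of comments counted in one batch
    stmt.execute("CREATE TABLE IF NOT EXISTS rng_comment.count_conmment (comment_date VARCHAR(32), comment_count INT)")
    stmt.close()
    conn.close()
  }
}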
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
object rua04 {
//1.5.1. Select the users whose Weibo VIP level is 5 and write those records into the vip_rank table in MySQL
def main(args: Array[String]): Unit = {
// val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
// val streamingContext: StreamingContext = new StreamingContext(sparkConf, Seconds(5))
val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
sc.setLogLevel("warn")
//create the StreamingContext
val streamingContext = new StreamingContext(sc, Seconds(2))
//Kafka connection parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "test",
//earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
//latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data
//none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception
//latest is configured here: resume from the committed offset if one exists, otherwise start from newly arriving data
"auto.offset.reset" -> "latest",
//false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// Kafka parameters (an alternative spelling using the ConsumerConfig constants)
// val kafkaParams: Map[String, Object] = Map[String, Object](
// ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "node01:9092,node02:9092,node03:9092",
// ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
// ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
// ConsumerConfig.GROUP_ID_CONFIG -> "SparkKafka",
// ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest",
// ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest", //false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
// ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean))
val topics = Array("rng_comment")
//consume the Kafka data and compute according to the business logic //location strategy: PreferConsistent is the recommended strategy and spreads partitions evenly across the Spark executors //consumer strategy: Subscribe is the recommended strategy
val kafkaDatas: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
//to reuse an RDD it has to be cached; caching and releasing an RDD are explicit operations we have to perform ourselves
val kfs: DStream[String] = kafkaDatas.map(_.value()).cache()
// .filter { x =>
// val tx = x.split("\t")
// tx(9) == "5" && tx.length == 11
// }.cache()
kfs.foreachRDD { rdd: RDD[String] =>
rdd.foreachPartition { iter: Iterator[String] => {
//get a MySQL connection, one per partition
val conn: Connection = DriverManager.getConnection("jdbc:mysql://node01:3306/rng_comment?characterEncoding=UTF-8", "root", "123456")
iter.foreach { line: String =>
println(line)
saveToMySQL(line, conn)
}
conn.close()
}
}
}
//start the streaming job
streamingContext.start()
//block until termination
streamingContext.awaitTermination()
}
def saveToMySQL(data: String, conn: Connection): Unit = {
//store the record in MySQL
val mData = data.split("\t")
if (mData.size == 11 && mData(9) == "5") {
//insert each matching record into MySQL
val sql = "insert into vip_rank values (?,?,?,?,?,?,?,?,?,?,?)"
val ps: PreparedStatement = conn.prepareStatement(sql)
ps.setString(1, mData(0))
ps.setString(2, mData(1))
ps.setString(3, mData(2))
ps.setString(4, mData(3))
ps.setString(5, mData(4))
ps.setString(6, mData(5))
ps.setString(7, mData(6))
ps.setString(8, mData(7))
ps.setString(9, mData(8))
ps.setString(10, mData(9))
ps.setString(11, mData(10))
ps.executeUpdate() //one executeUpdate is enough; calling execute() as well would insert the row twice (addBatch()/executeBatch() is the batched alternative)
ps.close()
}
}
// def saveToMySQL(partitionData: Iterator[rng_comment]): Unit = {
// //store the data in MySQL
// val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/rng_comment?characterEncoding=UTF-8", "root", "root")
// partitionData.foreach(data => {
// val sql = "insert into vip_rank values(?,?,?,?,?,?,?,?,?,?,?)"
//
// val ps = conn.prepareCall(sql)
//
// ps.setInt(1, data.index)
// ps.setInt(2, data.child_comment)
// ps.setString(3, data.comment_time)
// ps.setString(4, data.content)
// ps.setInt(5, data.da_v)
// ps.setInt(6, data.like_status)
// ps.setString(7, data.pic)
// ps.setString(8, data.user_id)
// ps.setString(9, data.user_name)
// ps.setInt(10, data.vip_rank)
// ps.setLong(11, data.stamp)
// ps.execute() //preparedStatement.addBatch()
// })
// //ps.executeBatch()
// conn.close()
//
// }
}
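The loop above opens and executes one statement per record. For larger batches, the addBatch()/executeBatch() variant hinted at in the comments is usually cheaper. A sketch of a batched, per-partition writer (the BatchedVipRankSink.savePartition helper is hypothetical and not part of the original code):

import java.sql.{Connection, DriverManager}

object BatchedVipRankSink {
  // batched variant of saveToMySQL: one PreparedStatement per partition, flushed with executeBatch()
  def savePartition(partition: Iterator[String]): Unit = {
    val conn: Connection = DriverManager.getConnection("jdbc:mysql://node01:3306/rng_comment?characterEncoding=UTF-8", "root", "123456")
    val ps = conn.prepareStatement("insert into vip_rank values (?,?,?,?,?,?,?,?,?,?,?)")
    partition.map(_.split("\t")).filter(f => f.length == 11 && f(9) == "5").foreach { f =>
      for (i <- 0 until 11) ps.setString(i + 1, f(i))
      ps.addBatch()
    }
    ps.executeBatch() // send all buffered inserts in one round trip
    ps.close()
    conn.close()
  }
}

Inside rua04 this would replace the per-record loop: rdd.foreachPartition(BatchedVipRankSink.savePartition).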
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object rua04_Demo {
val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://node01:3306/rng_comment"
val username = "root"
val password = "123456"
//1.5.1. Select the users whose Weibo VIP level is 5 and write those records into the vip_rank table in MySQL
def main(args: Array[String]): Unit = {
// val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
// val streamingContext: StreamingContext = new StreamingContext(sparkConf, Seconds(5))
val sparkConf: SparkConf = new SparkConf().setAppName("wula").setMaster("local[*]")
val sc = new SparkContext(sparkConf)
sc.setLogLevel("warn")
//create the StreamingContext
val streamingContext = new StreamingContext(sc, Seconds(2))
//Kafka connection parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "test",
//earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
//latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data
//none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception
//latest is configured here: resume from the committed offset if one exists, otherwise start from newly arriving data
"auto.offset.reset" -> "latest",
//false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("rng_comment")
//consume the Kafka data and compute according to the business logic
val recordDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](streamingContext,
LocationStrategies.PreferConsistent, //location strategy: PreferConsistent is recommended and spreads partitions evenly across the Spark executors
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)) //consumer strategy: Subscribe is the recommended strategy
val resultDStream: DStream[Array[String]] = recordDStream.map(_.value()).map(_.split("\t")).cache()
resultDStream.filter(_ (9) == "5").foreachRDD {
rdd: RDD[Array[String]] => {
rdd.foreachPartition {
iter: Iterator[Array[String]] => {
Class.forName(driver)
val connection: Connection = DriverManager.getConnection(url, username, password)
var sql = "insert into vip_rank values (?,?,?,?,?,?,?,?,?,?,?)"
iter.foreach {
line: Array[String] => {
val statement: PreparedStatement = connection.prepareStatement(sql)
statement.setString(1, line(0));
statement.setString(2, line(1));
statement.setString(3, line(2));
statement.setString(4, line(3));
statement.setString(5, line(4));
statement.setString(6, line(5));
statement.setString(7, line(6));
statement.setString(8, line(7));
statement.setString(9, line(8));
statement.setString(10, line(9));
statement.setString(11, line(10));
statement.executeUpdate()
statement.close()
}
}
connection.close()
}
}
}
}
//start the streaming job
streamingContext.start()
//block until termination
streamingContext.awaitTermination()
}
}
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.util.Properties
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object rua05_Demo {
//1.5.2. Select the records with more than 10 likes and write them into the like_status table in MySQL
case class RNG_Comment(index: String, child_comment: String, comment_time: String, content: String, da_v: String, like_status: String, pic: String, user_id: String, user_name: String, vipRank: String, stamp: String)
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().master("local[*]").appName("wula").getOrCreate()
val sparkContext: SparkContext = sparkSession.sparkContext
sparkContext.setLogLevel("warn")
//create the StreamingContext
val streamingContext = new StreamingContext(sparkContext, Seconds(3))
//Kafka connection parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "test",
//earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
//latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data
//none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception
//latest is configured here: resume from the committed offset if one exists, otherwise start from newly arriving data
"auto.offset.reset" -> "latest",
//false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("rng_comment")
//consume the Kafka data and compute according to the business logic //location strategy: PreferConsistent is the recommended strategy and spreads partitions evenly across the Spark executors //consumer strategy: Subscribe is the recommended strategy
val kafkaDatas: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
//to reuse an RDD it has to be cached; caching and releasing an RDD are explicit operations we have to perform ourselves
val dataD: DStream[String] = kafkaDatas.map(_.value()).cache().filter(data => {
val dataS = data.split("\t")
dataS.size == 11 && dataS(5).toInt > 10
})
val kafkaDataS = dataD.map(x => x.split("\t"))
val commentS: DStream[RNG_Comment] = kafkaDataS.map(x => RNG_Comment(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10)))
import sparkSession.implicits._
commentS.foreachRDD(data => {
val props = new Properties()
props.setProperty("user","root")
props.setProperty("password","123456")
val frame: DataFrame = data.toDF()
frame.write.mode(SaveMode.Append).jdbc("jdbc:mysql://node01:3306/rng_comment?characterEncoding=UTF-8","like_status",props)
})
//start the streaming job
streamingContext.start()
//block until termination
streamingContext.awaitTermination()
}
}
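One thing to watch in the filter above: like_status is parsed with toInt, so a single non-numeric value would make the whole batch fail. A hedged variant that simply drops unparsable records (the LikeFilter.hasMoreThanTenLikes helper is made up for this sketch):

import scala.util.Try

object LikeFilter {
  // true only for well-formed records with more than 10 likes; records with a non-numeric like count are skipped instead of throwing
  def hasMoreThanTenLikes(line: String): Boolean = {
    val fields = line.split("\t")
    fields.length == 11 && Try(fields(5).toInt).toOption.exists(_ > 10)
  }
}

The filter in rua05_Demo would then become kafkaDatas.map(_.value()).filter(LikeFilter.hasMoreThanTenLikes).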
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import java.util.Properties
import com.kmai.demo02.rua05_Demo.RNG_Comment
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
object rua06_Demo {
//1.5.3. Count the comments on each of 2018/10/20, 2018/10/21, 2018/10/22 and 2018/10/23 and write the results into the count_conmment table in MySQL
def main(args: Array[String]): Unit = {
val sparkSession: SparkSession = SparkSession.builder().master("local[*]").appName("wula").getOrCreate()
val sparkContext: SparkContext = sparkSession.sparkContext
sparkContext.setLogLevel("warn")
//create the StreamingContext
val streamingContext = new StreamingContext(sparkContext, Seconds(10))
//Kafka connection parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "node01:9092,node02:9092,node03:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "test",
//earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
//latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data
//none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception
//latest is configured here: resume from the committed offset if one exists, otherwise start from newly arriving data
"auto.offset.reset" -> "latest",
//false disables auto commit; offsets are committed by Spark to the checkpoint or managed manually
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("rng_comment")
//consume the Kafka data and compute according to the business logic //location strategy: PreferConsistent is the recommended strategy and spreads partitions evenly across the Spark executors //consumer strategy: Subscribe is the recommended strategy
val kafkaDatas: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
//to reuse an RDD it has to be cached; caching and releasing an RDD are explicit operations we have to perform ourselves
// val df = new SimpleDateFormat("yyyy/MM/dd hh:mm:ss")
val dataD: DStream[Array[String]] = kafkaDatas.map(_.value()).cache().map(_.split("\t")).cache()
.filter(data => {
data.size == 11 && (data(2).startsWith("2018/10/20") || data(2).startsWith("2018/10/21") || data(2).startsWith("2018/10/22") || data(2).startsWith("2018/10/23"))
})
dataD.foreachRDD {
rdd: RDD[Array[String]] => {
//group by the date part (the first 10 characters of comment_time) so the counts are per day rather than per timestamp
rdd.groupBy(x => x(2).substring(0, 10)).map(x => x._1 -> x._2.size).foreachPartition {
iter: Iterator[(String, Int)] => {
val connection: Connection = DriverManager.getConnection("jdbc:mysql://node01:3306/rng_comment?characterEncoding=UTF-8", "root", "123456")
iter.foreach {
line: (String, Int) => {
val statement: PreparedStatement = connection.prepareStatement("insert into count_conmment values (?,?)")
statement.setString(1, line._1);
statement.setString(2, line._2.toString);
statement.executeUpdate()
statement.close()
}
}
connection.close()
}
}
}
}
//start the streaming job
streamingContext.start()
//block until termination
streamingContext.awaitTermination()
}
}
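Because this is a streaming job, every 10-second batch inserts its own partial counts, so count_conmment ends up with several rows per day. The final per-day totals can be rolled up afterwards; a sketch, assuming the two columns are named comment_date and comment_count as in the table-creation sketch earlier (the object name CountCommentRollup is made up):

import java.sql.DriverManager

object CountCommentRollup {
  def main(args: Array[String]): Unit = {
    Class.forName("com.mysql.jdbc.Driver")
    val conn = DriverManager.getConnection("jdbc:mysql://node01:3306/rng_comment?characterEncoding=UTF-8", "root", "123456")
    val stmt = conn.createStatement()
    // sum the partial per-batch counts into one total per day
    val rs = stmt.executeQuery("SELECT comment_date, SUM(comment_count) FROM count_conmment GROUP BY comment_date")
    while (rs.next()) {
      println(s"${rs.getString(1)} -> ${rs.getLong(2)}")
    }
    rs.close()
    stmt.close()
    conn.close()
  }
}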