Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Creating a broadcast variable with SparkSession? (Spark 2.0)

Is it possible to create broadcast variables with the sparkContext provided by SparkSession? I keep getting an error under sc.broadcast; however, in a different project, when using the SparkContext from org.apache.spark.SparkContext, I have no problems.

import org.apache.spark.sql.SparkSession


object MyApp {
 def main(args: Array[String]){
  // Build a local SparkSession; spark.sparkContext exposes the underlying SparkContext.
  val spark = SparkSession.builder()
       .appName("My App")
       .master("local[*]")
       .getOrCreate()

  // BUG (the subject of this question): SparkContext.setLogLevel returns Unit,
  // so chaining it here makes `sc` have type Unit, not SparkContext. That is why
  // `sc.broadcast(...)` below fails with "value broadcast is not a member of Unit".
  // The fix (see the answer at the bottom of this page) is to split this into:
  //   val sc = spark.sparkContext
  //   sc.setLogLevel("ERROR")
  val sc = spark.sparkContext
        .setLogLevel("ERROR")

  // Load the GitHub archive dump as a DataFrame (Windows path from the asker's machine).
  val path = "C:\\Boxes\\github-archive\\2015-03-01-0.json"
  val ghLog = spark.read.json(path)


  // Keep only push events.
  val pushes = ghLog.filter("type = 'PushEvent'")

  pushes.printSchema()
  println("All events: "+ ghLog.count)
  println("Only pushes: "+pushes.count)
  pushes.show(5)


  // Count pushes per GitHub user.
  val grouped = pushes.groupBy("actor.login").count()
  grouped.show(5)


  // Most active pushers first.
  val ordered = grouped.orderBy(grouped("count").desc)
  ordered.show(5)

  // Read employee names (one per line) into an immutable Set.
  // NOTE(review): the Source returned by fromFile is never closed — fine for a
  // throwaway example, but a leak in real code.
  import scala.io.Source.fromFile
  val fileName= "ghEmployees.txt"
  val employees = Set() ++ ( 
    for { 
      line <- fromFile(fileName).getLines()
    } yield line.trim
    )


  // Fails to compile: `sc` is Unit here (see the BUG note above), not a SparkContext.
  // Broadcasting a Set (vs. a Seq) is NOT the problem — any serializable value works.
  val bcEmployees = sc.broadcast(employees)
 }
}

Or is it a problem of using a Set() instead of a Seq object?

Thanks for any help

Edit:

I keep getting a "cannot resolve symbol broadcast" error message in IntelliJ.

After compiling I get an error of: Error:(47, 28) value broadcast is not a member of Unit val bcEmployees = sc.broadcast(employees) ^

like image 593
ukbaz Avatar asked Jan 05 '23 22:01

ukbaz


1 Answers

Your sc variable has type Unit because, according to the docs, setLogLevel has return type Unit. Assign the SparkContext first, then set the log level as a separate statement:

val sc: SparkContext = spark.sparkContext
sc.setLogLevel("ERROR")

It is important to keep track of the types of your variables to catch errors earlier.

like image 61
evan.oman Avatar answered Jan 11 '23 06:01

evan.oman