Friday, April 8, 2022

spark salting technique explained

 The following is an example of the Spark salting technique.



// Fact side of the join. It is deliberately skewed: 8 of the 9 rows share ID = 1,
// so an ordinary join on ID would send almost all rows to a single partition.
val df1 = Seq(
  (1, "a"), (2, "b"), (1, "c"), (1, "x"), (1, "y"),
  (1, "g"), (1, "k"), (1, "u"), (1, "n")
).toDF("ID", "NAME")

df1.createOrReplaceTempView("fact")

// Dimension side: one SALARY row per ID (ID 3 has no matching fact row).
val df2 = Seq((1, 10), (2, 30), (3, 40)).toDF("ID", "SALARY")

df2.createOrReplaceTempView("dim")


// Number of salt buckets, shared by both sides of the join.
// The fact side draws a salt in 0 .. saltBuckets - 1 and the dimension side is
// replicated once for exactly that same range. If the two ranges disagree, either
// some salted fact keys find no dimension row, or dimension rows are replicated
// for salts that can never occur.
val saltBuckets = 19

// Salt the skewed fact key as "<ID>_<salt>".
// RAND(seed) is uniform in [0, 1), so FLOOR(RAND(seed) * saltBuckets) is an
// integer in 0 .. saltBuckets - 1 (the upper bound saltBuckets is never produced).
val salted_df1 = spark.sql(
  s"""select concat(ID, '_', FLOOR(RAND(123456)*$saltBuckets)) as salted_key, NAME from fact """)

salted_df1.createOrReplaceTempView("salted_fact")

// Replicate every dimension row once per possible salt value.
// sequence(start, stop) is inclusive on both ends, so it must stop at
// saltBuckets - 1 to mirror the fact-side salt range; this replaces a
// hand-written array(0, 1, ..., 19) literal that had one bucket too many.
val exploded_dim_df = spark.sql(
  s""" select ID, SALARY, explode(sequence(0, ${saltBuckets - 1})) as salted_key from dim""")

exploded_dim_df.createOrReplaceTempView("salted_dim")

// Join on the salted key so the rows of the hot ID are spread over several join keys,
// then strip the salt again to recover the original ID (as a string, because it is
// taken from the split of the salted key).
val result_df = spark.sql("""select split(fact.salted_key, '_')[0] as ID, dim.SALARY
            from salted_fact fact
            LEFT JOIN salted_dim dim
            ON fact.salted_key = concat(dim.ID, '_', dim.salted_key) """)

display(result_df)

No comments:

Post a Comment

Recent Post

Databricks Delta table merge Example

here's some sample code that demonstrates a merge operation on a Delta table using PySpark:   from pyspark.sql import SparkSession # cre...