the fallowing is for spark salting technique
var df1 = Seq((1,"a"),(2,"b"),(1,"c"),(1,"x"),(1,"y"),(1,"g"),(1,"k"),(1,"u"),(1,"n")).toDF("ID","NAME")
df1.createOrReplaceTempView("fact")
var df2 = Seq((1,10),(2,30),(3,40)).toDF("ID","SALARY")
df2.createOrReplaceTempView("dim")
val salted_df1 = spark.sql("""select concat(ID, '_', FLOOR(RAND(123456)*19)) as salted_key, NAME from fact """)
salted_df1.createOrReplaceTempView("salted_fact")
# remember that explode array should start with 0 and end with whatever the values we mentioned in the above flor and rand
val exploded_dim_df = spark.sql(""" select ID, SALARY, explode(array(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19)) as salted_key from dim""")
//val exploded_dim_df = spark.sql(""" select ID, SALARY, explode(array(0 to 19)) as salted_key from dim""")
exploded_dim_df.createOrReplaceTempView("salted_dim")
val result_df = spark.sql("""select split(fact.salted_key, '_')[0] as ID, dim.SALARY
from salted_fact fact
LEFT JOIN salted_dim dim
ON fact.salted_key = concat(dim.ID, '_', dim.salted_key) """)
display(result_df)
No comments:
Post a Comment