PySpark - create a new column by concatenating other columns
PySpark can create a new column by concatenating other columns:
import pyspark.sql.functions as fn

# Join col1 and col2 with an underscore separator
df = df_raw \
    .withColumn('new_col', fn.concat_ws('_', df_raw.col1, df_raw.col2))
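For example, with a small DataFrame (the Spark session and sample values below are assumed for illustration, not part of the original snippet), the new column joins the two values with an underscore:

import pyspark.sql.functions as fn
from pyspark.sql import SparkSession

# Assumed example session and data, for illustration only
spark = SparkSession.builder.getOrCreate()
df_raw = spark.createDataFrame([("a", "b"), ("c", "d")], ["col1", "col2"])

df = df_raw.withColumn('new_col', fn.concat_ws('_', df_raw.col1, df_raw.col2))
df.show()
# +----+----+-------+
# |col1|col2|new_col|
# +----+----+-------+
# |   a|   b|    a_b|
# |   c|   d|    c_d|
# +----+----+-------+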
PySpark can also create a JSON column from selected columns:
import pyspark.sql.functions as fn

# Pack the selected columns into a struct and serialize it as a JSON string
json_columns = ["col1", "col2"]
df = df_raw \
    .withColumn('json', fn.to_json(fn.struct([df_raw[x] for x in json_columns])))
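With the same assumed sample data as above, each row of the json column would hold a string such as {"col1":"a","col2":"b"}:

# Continuing the assumed example data from above
df.select('json').show(truncate=False)
# +-----------------------+
# |json                   |
# +-----------------------+
# |{"col1":"a","col2":"b"}|
# |{"col1":"c","col2":"d"}|
# +-----------------------+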
PySpark can also create a JSON column from all other columns:
import pyspark.sql.functions as fn

# Pack every column of df_raw into a struct and serialize it as a JSON string
df = df_raw \
    .withColumn('json', fn.to_json(fn.struct([fn.col(c) for c in df_raw.columns])))
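As far as I know, an equivalent and slightly shorter form unpacks the column names directly into struct, which avoids the list comprehension:

import pyspark.sql.functions as fn

# struct also accepts column names, so all columns can be passed unpacked
df = df_raw \
    .withColumn('json', fn.to_json(fn.struct(*df_raw.columns)))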