report_umsatz = kunden_segmentiert.groupBy("revenue_segment").agg(
count("*").alias("n_customers"),
spark_round(sum("total_revenue")/1000000, 2).alias("segment_revenue_mill"),
spark_round(avg("total_revenue"), 2).alias("avg_revenue")
).orderBy(desc("segment_revenue_mill"))
# Generate Umsatz prozent column
window_all = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
# Percent of customers
report_umsatz = report_umsatz.withColumn(
"n_customers_percent",
spark_round(col("n_customers") / sum("n_customers").over(window_all)*100, 2))
report_umsatz = report_umsatz.withColumn(
"segment_revenue_percent",
spark_round(col("segment_revenue_mill") / sum("segment_revenue_mill").over(window_all)*100, 2))
# Reorder Spark columns BEFORE toPandas()
report_umsatz_reordered = report_umsatz.select(
"revenue_segment",
"n_customers",
"n_customers_percent",
"segment_revenue_mill",
"avg_revenue",
"segment_revenue_percent"
)
# Convert to table
table_df = (
report_umsatz_reordered.toPandas()
.style.hide(axis="index")
.format({
"segment_revenue_mill": "{:,.2f}",
"avg_revenue": "{:,.2f}",
"segment_revenue_percent": "{:,.2f}",
"n_customers_percent": "{:,.2f}",
})
.set_properties(
subset=["n_customers", "n_customers_percent", "segment_revenue_mill", "avg_revenue", "segment_revenue_percent"],
**{"text-align": "right"}
)
.set_table_styles([
{"selector": "td",
"props": [("padding-right", "8px"), ("padding-left", "8px")]},
{"selector": "th",
"props": [("padding-right", "8px"), ("padding-left", "8px")]}
])
)
table_df