Propensity to Convert ML Model Usage¶

In this notebook we will use sample behavioral data collected by Snowplow's JavaScript tracker on Snowplow's website.

Using our production propensity model, we will score the users in this dataset and save the output to a new table that feeds into marketing campaigns.

Configuration¶

This is only necessary if you have not already followed the configuration steps in the previous model training notebook, or if your setup has changed.

  • Python 3.8
  • Install Snowpark for Python and the other required libraries in your environment. If you are using Anaconda, run the following to create a virtual environment with all the needed packages:
    • conda create --name snowpark --override-channels -c https://repo.anaconda.com/pkgs/snowflake python=3.8 numpy pandas dill scikit-learn cachetools lightgbm notebook snowflake-snowpark-python matplotlib plotly
    • conda activate snowpark
    • pip install imblearn
  • Add a connection.json file to your directory with your Snowflake account details and credentials (view template). Use the same schema in which you created the first_touch_user_features view during the feature exploration step
  • Snowpark-optimized warehouses are recommended if your ML training has large memory requirements
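The connection.json file referenced above typically looks like the following. The keys match the standard Snowpark connection parameters passed to Session.builder.configs; all values here are placeholders:

```
{
  "account": "<your_account_identifier>",
  "user": "<your_username>",
  "password": "<your_password>",
  "role": "<your_role>",
  "warehouse": "<your_warehouse>",
  "database": "<your_database>",
  "schema": "<schema_containing_first_touch_user_features>"
}
```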
In [1]:
import json
import cachetools
from snowflake.snowpark import Session
from snowflake.snowpark.functions import udf, call_udf, col
In [2]:
connection_parameters = json.load(open('connection.json'))
session = Session.builder.configs(connection_parameters).create()

Perform predictions¶

Models can be served from Snowflake as a user-defined function (UDF).

In [4]:
session.clear_imports()
session.clear_packages()

# imblearn was pip-installed locally, so it is uploaded as an import
# rather than requested as a Snowflake Anaconda package
import imblearn
imblearn_path = imblearn.__path__[0]

# Make the serialized model and imblearn available inside the UDF sandbox
session.add_import('@ml_models/model.joblib')
session.add_import(imblearn_path)
session.add_packages('pandas', 'scikit-learn', 'dill')


@cachetools.cached(cache={})
def read_model(filename):
  # Cached so the model file is deserialized only once per worker process,
  # rather than on every row the UDF scores
  import dill
  import sys
  import os

  # Snowflake exposes the directory holding staged imports via this option
  import_dir = sys._xoptions.get("snowflake_import_directory")
  if import_dir:
    with open(os.path.join(import_dir, filename), 'rb') as file:
      return dill.load(file)


@udf(
  name='predict_propensity_to_convert',
  session=session,
  replace=True,
  is_permanent=True,
  stage_location='@ml_model_udfs',
  packages=["cachetools", "pandas", "dill", "lightgbm"]
)
def predict_propensity_to_convert(
  first_page_title: str,
  refr_urlhost: str,
  refr_medium: str,
  mkt_medium: str,
  mkt_source: str,
  mkt_term: str,
  mkt_campaign: str,
  engaged_time_in_s: float,
  absolute_time_in_s: float,
  vertical_percentage_scrolled: float,
  geo_country: str,
  geo_region: str,
  br_lang: str,
  device_family: str,
  os_family: str,
) -> float:
  import pandas as pd

  model = read_model('model.joblib')

  df = pd.DataFrame([[
    first_page_title,
    refr_urlhost,
    refr_medium,
    mkt_medium,
    mkt_source,
    mkt_term,
    mkt_campaign,
    engaged_time_in_s,
    absolute_time_in_s,
    vertical_percentage_scrolled,
    geo_country,
    geo_region,
    br_lang,
    device_family,
    os_family]],
    columns=[
      "first_page_title",
      "refr_urlhost",
      "refr_medium",
      "mkt_medium",
      "mkt_source",
      "mkt_term",
      "mkt_campaign",
      "engaged_time_in_s",
      "absolute_time_in_s",
      "vertical_percentage_scrolled",
      "geo_country",
      "geo_region",
      "br_lang",
      "device_family",
      "os_family"]
    )

  # predict_proba returns [[P(not convert), P(convert)]]; keep the positive-class probability
  prediction = model.predict_proba(df)[0][1]

  return prediction
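The @cachetools.cached decorator on read_model matters because the UDF handler can be invoked once per row: without it, the model would be deserialized from the stage on every call. A minimal sketch of the memoization pattern, using a hypothetical load_count purely for illustration:

```python
import cachetools

load_count = 0

@cachetools.cached(cache={})
def read_model(filename):
  # The body runs only on a cache miss; repeated calls with the same
  # filename return the cached result without re-reading the file
  global load_count
  load_count += 1
  return f"model loaded from {filename}"

read_model('model.joblib')
read_model('model.joblib')
read_model('model.joblib')
print(load_count)  # the expensive load ran once, not three times
```

The cache is keyed on the function arguments, so loading a second model file would trigger exactly one more load.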

You can now use this UDF in SQL:

SELECT
  user_id,
  predict_propensity_to_convert(
    first_page_title, refr_urlhost, refr_medium, mkt_medium, mkt_source,
    mkt_term, mkt_campaign, engaged_time_in_s, absolute_time_in_s,
    vertical_percentage_scrolled, geo_country, geo_region, br_lang,
    device_family, os_family
  ) AS prediction
FROM FIRST_TOUCH_USER_FEATURES;
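Inside the UDF, model.predict_proba(df)[0][1] takes the second column of the first (and only) row, i.e. the predicted probability of the positive "convert" class. A quick local illustration of this indexing with a scikit-learn classifier on toy data (not the production model):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy training data: one feature, label 1 when the feature is large
X = np.array([[0.], [1.], [2.], [3.]])
y = np.array([0, 0, 1, 1])

model = LogisticRegression().fit(X, y)

# predict_proba returns one row per input, one column per class:
# [[P(class 0), P(class 1)]]
proba = model.predict_proba([[2.5]])
score = proba[0][1]  # probability of the positive class

print(proba.shape)  # (1, 2)
```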

You can also call it in your Jupyter notebooks:

In [3]:
import pandas as pd

user_features = session.table('FIRST_TOUCH_USER_FEATURES')

predictions = user_features.select(
  'user_id', 'first_page_title', 'refr_urlhost', 'refr_medium', 'mkt_medium', 'mkt_source', 'mkt_term', 'mkt_campaign', 'engaged_time_in_s',
  'absolute_time_in_s', 'vertical_percentage_scrolled', 'geo_country', 'geo_region', 'br_lang', 'device_family', 'os_family',
  call_udf(
    'predict_propensity_to_convert', col('first_page_title'), col('refr_urlhost'), col('refr_medium'), col('mkt_medium'), col('mkt_source'),
    col('mkt_term'), col('mkt_campaign'), col('engaged_time_in_s'), col('absolute_time_in_s'), col('vertical_percentage_scrolled'),
    col('geo_country'), col('geo_region'), col('br_lang'), col('device_family'), col('os_family')
  ).as_('propensity_score')
)

predictions = predictions.sort(col("propensity_score"), ascending=False)

predictions_df = pd.DataFrame(predictions.collect())

display(predictions_df.head(100))
USER_ID FIRST_PAGE_TITLE REFR_URLHOST REFR_MEDIUM MKT_MEDIUM MKT_SOURCE MKT_TERM MKT_CAMPAIGN ENGAGED_TIME_IN_S ABSOLUTE_TIME_IN_S VERTICAL_PERCENTAGE_SCROLLED GEO_COUNTRY GEO_REGION BR_LANG DEVICE_FAMILY OS_FAMILY PROPENSITY_SCORE
0 a3266401b25ae621ac4c47d8c3a702f6 Get started with your Snowplow BDP demo | Snow... snowplow.io internal None None None None 5 11 34.0 US IL en-US Samsung SM-T227U Android 1.000000
1 3551fcf45dfa752369daa46310e05084 Book a demo | Snowplow www.google.com search None None None None 365 78 28.0 US LA en-US Samsung SM-G920V Android 1.000000
2 a9877f1707fbf5ddaa53a28088e4e3e2 Get started with your Snowplow BDP demo | Snow... www.linkedin.com social None None None None 10 9 31.0 US KS en-US Other Windows 1.000000
3 c163c3cbda462b2ff166fcdb4e548c80 Get started with your Snowplow BDP demo | Snow... snowplow.io internal None None None None 0 0 28.0 IN None en-US Other Windows 1.000000
4 0ea985ad1166f7388dc6d803c50adc1c Get in touch with the team at Snowplow | Snowplow None None None None None None 5 180 35.0 IN WB en-US Other Windows 1.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95 aa7d9ef140e7ee34a859780bb488cdb1 Get started with your Snowplow BDP demo | Snow... snowplow.io internal None None None None 0 0 19.0 CA QC en-CA Samsung SM-A526W Android 0.999986
96 0f033d5d54663573331217344b70c178 Book a demo | Snowplow None None None None None None 75 359 23.0 US WI en-US Samsung SM-N976V Android 0.999985
97 0492ef2286814d6b996a33067e36826a Get started with your Snowplow BDP demo | Snow... None None None None None None 0 0 32.0 US OR en-US Mac Mac OS X 0.999983
98 815e01dd4b919a5def4c9e0b01fc167e Snowplow: behavioral data creation leader www.ai-expo.net unknown None None None None 130 29 90.0 IN None en-US Other Windows 0.999982
99 7582890304a929d7344d0e0cf3a2e0de Get started with your Snowplow BDP demo | Snow... com.linkedin.android unknown None None None None 5 0 20.0 US CO en-US moto g (60) Android 0.999981

100 rows × 17 columns

Save predictions¶

Depending on your use case, it can be useful to bucket these propensity scores using deciles or cut them into labels like High, Medium and Low propensity. This makes it easier for data consumers to use these predictions, for example, to filter marketing campaign audiences. Either save your scores to a new table, or add them into your main user table.

In [4]:
# Equal-count decile buckets: 0 (lowest propensity) through 9 (highest)
predictions_df["PROPENSITY_DECILE"] = pd.qcut(predictions_df["PROPENSITY_SCORE"], 10, labels=False)
# Fixed-threshold labels: (0, 0.33] Low, (0.33, 0.66] Medium, (0.66, 1.0] High
predictions_df["PROPENSITY_LABEL"] = pd.cut(predictions_df["PROPENSITY_SCORE"], [0., 0.33, 0.66, 1.0],
                                            include_lowest=True, labels=['Low', 'Medium', 'High'])
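On evenly spread toy scores, the two bucketing calls behave as follows (a small check of the qcut/cut semantics used above):

```python
import numpy as np
import pandas as pd

scores = pd.Series(np.linspace(0.0, 1.0, 100))

# qcut: equal-count buckets; labels=False yields integer deciles 0..9
deciles = pd.qcut(scores, 10, labels=False)

# cut: fixed thresholds at 0.33 and 0.66; include_lowest keeps 0.0 in 'Low'
labels = pd.cut(scores, [0., 0.33, 0.66, 1.0], include_lowest=True,
                labels=['Low', 'Medium', 'High'])

print(deciles.min(), deciles.max())       # 0 9
print(labels[0], labels[50], labels[99])  # Low Medium High
```

Note that qcut ranks by score, so heavily skewed scores (like the near-1.0 values above) still split into ten equal-sized deciles, while cut can leave some labels empty.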
In [5]:
import plotly.express as px

fig = px.histogram(predictions_df, x="PROPENSITY_SCORE", color="PROPENSITY_LABEL", nbins=50, log_y=True)
fig.show()
In [8]:
# Save scores to table
df = predictions_df[["USER_ID", "PROPENSITY_SCORE", "PROPENSITY_DECILE", "PROPENSITY_LABEL"]]
snowpark_predictions = session.write_pandas(df, "SNOWPLOW_USER_PROPENSITY_SCORES", auto_create_table=True, overwrite=True)

df.head(100)
Out[8]:
USER_ID PROPENSITY_SCORE PROPENSITY_DECILE PROPENSITY_LABEL
0 a3266401b25ae621ac4c47d8c3a702f6 1.000000 9 High
1 3551fcf45dfa752369daa46310e05084 1.000000 9 High
2 a9877f1707fbf5ddaa53a28088e4e3e2 1.000000 9 High
3 c163c3cbda462b2ff166fcdb4e548c80 1.000000 9 High
4 0ea985ad1166f7388dc6d803c50adc1c 1.000000 9 High
... ... ... ... ...
95 aa7d9ef140e7ee34a859780bb488cdb1 0.999986 9 High
96 0f033d5d54663573331217344b70c178 0.999985 9 High
97 0492ef2286814d6b996a33067e36826a 0.999983 9 High
98 815e01dd4b919a5def4c9e0b01fc167e 0.999982 9 High
99 7582890304a929d7344d0e0cf3a2e0de 0.999981 9 High

100 rows × 4 columns