You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@spark.apache.org by "Arne Koopman (Jira)" <ji...@apache.org> on 2022/11/03 15:54:00 UTC

[jira] [Updated] (SPARK-41008) Isotonic regression result differs from sklearn implementation

     [ https://issues.apache.org/jira/browse/SPARK-41008?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Arne Koopman updated SPARK-41008:
---------------------------------
    Description: 
 

{{```}}

import pandas as pd
from pyspark.sql.types import DoubleType
from sklearn.isotonic import IsotonicRegression as IsotonicRegression_sklearn
from pyspark.ml.regression import IsotonicRegression as IsotonicRegression_pyspark
 # The P(positives | model_score):
 # 0.6 -> 0.5 (1 out of the 2 labels is positive)
 # 0.333 -> 0.333 (1 out of the 3 labels is positive)
 # 0.20 -> 0.25 (1 out of the 4 labels is positive)
tc_pd = pd.DataFrame(
    {         "model_score": [0.6, 0.6, 0.333, 0.333, 0.333, 0.20, 0.20, 0.20, 0.20],         "label": [1, 0, 0, 1, 0, 1, 0, 0, 0],         "weight": 1,     }
)

 # The fraction of positives for each of the distinct model_scores would be the best fit.
 # Resulting in the following expected calibrated model_scores:
 # "calibrated_model_score": [0.5, 0.5, 0.333, 0.333, 0.333, 0.25, 0.25, 0.25, 0.25]

 # The sklearn implementation of Isotonic Regression. 
from sklearn.isotonic import IsotonicRegression as IsotonicRegression_sklearn
tc_regressor_sklearn = IsotonicRegression_sklearn().fit(X=tc_pd['model_score'], y=tc_pd['label'], sample_weight=tc_pd['weight'])
print("sklearn:", tc_regressor_sklearn.predict(tc_pd['model_score']))

 # >> sklearn: [0.5 0.5 0.33333333 0.33333333 0.33333333 0.25 0.25 0.25 0.25 ]

 # The pyspark implementation of Isotonic Regression. 
tc_df = spark.createDataFrame(tc_pd)
tc_df = tc_df.withColumn('model_score', F.col('model_score').cast(DoubleType()))

isotonic_regressor_pyspark = IsotonicRegression_pyspark(featuresCol='model_score', labelCol='label', weightCol='weight')
tc_model = isotonic_regressor_pyspark.fit(tc_df)
tc_pd = tc_model.transform(tc_df).toPandas()
print("pyspark:", tc_pd['prediction'].values)
 # >> pyspark: [0.5 0.5 0.33333333 0.33333333 0.33333333 0. 0. 0. 0. ]

 # The result from the pyspark implementation seems incorrect. Similar small toy examples lead to similarly unexpected results for the pyspark implementation. 

 # Strangely enough, for 'large' datasets, the difference between calibrated model_scores generated by both implementations disappears.
 # 
{{```}}

  was:
import pandas as pd
from pyspark.sql.types import DoubleType
from sklearn.isotonic import IsotonicRegression as IsotonicRegression_sklearn
from pyspark.ml.regression import IsotonicRegression as IsotonicRegression_pyspark

# The P(positives | model_score):
# 0.6 -> 0.5 (1 out of the 2 labels is positive)
# 0.333 -> 0.333 (1 out of the 3 labels is positive)
# 0.20 -> 0.25 (1 out of the 4 labels is positive)
tc_pd = pd.DataFrame(
    {
        "model_score": [0.6, 0.6, 0.333, 0.333, 0.333, 0.20, 0.20, 0.20, 0.20],
        "label": [1, 0, 0, 1, 0, 1, 0, 0, 0],
        "weight": 1,
    }
)
# The fraction of positives for each of the distinct model_scores would be the best fit.
# Resulting in the following expected calibrated model_scores:
# "calibrated_model_score": [0.5, 0.5, 0.333, 0.333, 0.333, 0.25, 0.25, 0.25, 0.25]

# The sklearn implementation of Isotonic Regression. 
from sklearn.isotonic import IsotonicRegression as IsotonicRegression_sklearn
tc_regressor_sklearn = IsotonicRegression_sklearn().fit(X=tc_pd['model_score'], y=tc_pd['label'], sample_weight=tc_pd['weight'])
print("sklearn:", tc_regressor_sklearn.predict(tc_pd['model_score']))

# >> sklearn: [0.5 0.5 0.33333333 0.33333333 0.33333333 0.25 0.25 0.25 0.25 ]

# The pyspark implementation of Isotonic Regression. 
tc_df = spark.createDataFrame(tc_pd)
tc_df = tc_df.withColumn('model_score', F.col('model_score').cast(DoubleType()))

isotonic_regressor_pyspark = IsotonicRegression_pyspark(featuresCol='model_score', labelCol='label', weightCol='weight')
tc_model = isotonic_regressor_pyspark.fit(tc_df)
tc_pd = tc_model.transform(tc_df).toPandas()
print("pyspark:", tc_pd['prediction'].values)

# >> pyspark: [0.5 0.5 0.33333333 0.33333333 0.33333333 0. 0. 0. 0. ]

# The result from the pyspark implementation seems incorrect. Similar small toy examples lead to similarly unexpected results for the pyspark implementation. 

# Strangely enough, for 'large' datasets, the difference between calibrated model_scores generated by both implementations disappears.


> Isotonic regression result differs from sklearn implementation
> --------------------------------------------------------------
>
>                 Key: SPARK-41008
>                 URL: https://issues.apache.org/jira/browse/SPARK-41008
>             Project: Spark
>          Issue Type: Bug
>          Components: MLlib
>    Affects Versions: 3.3.1
>            Reporter: Arne Koopman
>            Priority: Major
>
>  
> {{```}}
> import pandas as pd
> from pyspark.sql.types import DoubleType
> from sklearn.isotonic import IsotonicRegression as IsotonicRegression_sklearn
> from pyspark.ml.regression import IsotonicRegression as IsotonicRegression_pyspark
>  # The P(positives | model_score):
>  # 0.6 -> 0.5 (1 out of the 2 labels is positive)
>  # 0.333 -> 0.333 (1 out of the 3 labels is positive)
>  # 0.20 -> 0.25 (1 out of the 4 labels is positive)
> tc_pd = pd.DataFrame(
>     {         "model_score": [0.6, 0.6, 0.333, 0.333, 0.333, 0.20, 0.20, 0.20, 0.20],         "label": [1, 0, 0, 1, 0, 1, 0, 0, 0],         "weight": 1,     }
> )
>  # The fraction of positives for each of the distinct model_scores would be the best fit.
>  # Resulting in the following expected calibrated model_scores:
>  # "calibrated_model_score": [0.5, 0.5, 0.333, 0.333, 0.333, 0.25, 0.25, 0.25, 0.25]
>  # The sklearn implementation of Isotonic Regression. 
> from sklearn.isotonic import IsotonicRegression as IsotonicRegression_sklearn
> tc_regressor_sklearn = IsotonicRegression_sklearn().fit(X=tc_pd['model_score'], y=tc_pd['label'], sample_weight=tc_pd['weight'])
> print("sklearn:", tc_regressor_sklearn.predict(tc_pd['model_score']))
>  # >> sklearn: [0.5 0.5 0.33333333 0.33333333 0.33333333 0.25 0.25 0.25 0.25 ]
>  # The pyspark implementation of Isotonic Regression. 
> tc_df = spark.createDataFrame(tc_pd)
> tc_df = tc_df.withColumn('model_score', F.col('model_score').cast(DoubleType()))
> isotonic_regressor_pyspark = IsotonicRegression_pyspark(featuresCol='model_score', labelCol='label', weightCol='weight')
> tc_model = isotonic_regressor_pyspark.fit(tc_df)
> tc_pd = tc_model.transform(tc_df).toPandas()
> print("pyspark:", tc_pd['prediction'].values)
>  # >> pyspark: [0.5 0.5 0.33333333 0.33333333 0.33333333 0. 0. 0. 0. ]
>  # The result from the pyspark implementation seems incorrect. Similar small toy examples lead to similarly unexpected results for the pyspark implementation. 
>  # Strangely enough, for 'large' datasets, the difference between calibrated model_scores generated by both implementations disappears.
>  # 
> {{```}}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org
For additional commands, e-mail: issues-help@spark.apache.org