%sh
#wget https://raw.githubusercontent.com/bcbarsness/machine-learning/master/USA_Housing.csv
# USA_Housing.csv数据集包含美国各地区的住房相关信息，包括以下字段：
# - 'Avg. Area Income': 区域平均收入
# - 'Avg. Area House Age': 区域房屋平均年龄
# - 'Avg. Area Number of Rooms': 区域平均房间数
# - 'Avg. Area Number of Bedrooms': 区域平均卧室数
# - 'Area Population': 区域人口
# - 'Price': 房价（目标变量）
# - 'Address': 房屋地址（通常不参与建模）
# 该数据集常用于线性回归等机器学习任务，分析影响房价的因素。

# 导入所需的库
# pandas用于数据处理，numpy用于数值计算，matplotlib和seaborn用于数据可视化
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# read data
USAhousing = pd.read_csv('USA_Housing.csv')

# 设置pandas显示选项：最多显示20行
pd.set_option('display.max_rows', 20)
# 设置pandas显示选项：最多显示20列
pd.set_option('display.max_columns', 20)
# 设置pandas显示选项：每行最多显示300字符宽度
pd.set_option('display.width', 300)
# 显示USAhousing数据集的前5行
USAhousing.head(5)

# 选择用于回归分析的自变量和因变量
# x为“Avg. Area Income”（区域平均收入），y为“Price”（房价）
# 使用seaborn绘制自变量与因变量的散点图，观察两者之间的关系
x = USAhousing[['Avg. Area Income']]
y = USAhousing['Price']
sns.scatterplot(data=USAhousing, x='Avg. Area Income', y='Price')

<Axes: xlabel='Avg. Area Income', ylabel='Price'>

# 导入sklearn库
# 创建线性回归模型并拟合数据
# 输出模型的系数和截距
import sklearn as sk
lm = sk.linear_model.LinearRegression()
lm.fit(x,y)
print(lm.coef_[0], lm.intercept_)

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

21.19548317193168 -221579.4782059181

# 绘制拟合直线
# m为回归系数（斜率），b为截距
# plt.scatter绘制原始数据点，
# plt.plot绘制回归拟合线
plt.xlabel('Avg. Area Income')
plt.ylabel('Price')
plt.scatter(x,y)
# Y = m * x + b
m = lm.coef_[0]
b = lm.intercept_
plt.plot(x, m*x + b, color='red')

[<matplotlib.lines.Line2D at 0x7fc507b81c10>]

#using seaborn
pdf = USAhousing
sns.regplot(x='Avg. Area Income', y='Price', data=pdf, line_kws={"color": "red"})

<Axes: xlabel='Avg. Area Income', ylabel='Price'>

# predictions
print(f"{lm.predict([[18000]])[0]:.2f}")
print(f"{lm.predict([[40000]])[0]:.2f}")
print(f"{lm.predict([[80000]])[0]:.2f}")
print(f"{lm.predict([[100000]])[0]:.2f}")

/databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
/databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

159939.22
626239.85
1474059.18

/databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(
/databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

1897968.84

# 使用训练好的线性回归模型 lm 对自变量 x 进行预测，
# 得到预测结果 lm_results
# 将预测结果转换为 DataFrame 以便查看
lm_results = lm.predict(x)
pd.DataFrame(data=lm_results)

# 计算线性回归模型的R^2拟合优度指标
# r2_score用于评估模型对数据的拟合程度，值越接近1表示拟合效果越好
from sklearn.metrics import r2_score
predictions = lm.predict(x)
r2_score(y, predictions)

0.4092593070338846

# 输出线性回归模型的系数、截距、回归方程和R^2拟合优度指标
print(f"Coefficient: {lm.coef_[0]:.2f}")
print(f"Intercept: {lm.intercept_:.2f}")
print(f"Equation Price = {lm.coef_[0]:.2f} * Avg. Area Income + {lm.intercept_:.2f}")
print(f"R^2 fit metric {r2_score(y, predictions):.2f}")

Coefficient: 21.20
Intercept: -221579.48
Equation Price = 21.20 * Avg. Area Income + -221579.48
R^2 fit metric 0.41

# 多变量线性回归
# x_cols为自变量列名列表（去除'Price'和'Address'）
# X为所有自变量组成的数据框，y为因变量（房价）
x_cols = list(set(USAhousing.columns) - {'Price', 'Address'})
X = USAhousing[[*x_cols]]
y = USAhousing['Price']

# 将数据集划分为训练集和测试集，测试集占40%，随机种子为101
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.4, random_state=101)
X_train.head()

# 使用训练集数据拟合多变量线性回归模型
lm = sk.linear_model.LinearRegression()
lm.fit(X_train, y_train)

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

LinearRegression()

LinearRegression()

# 输出多变量线性回归模型的系数、截距、回归方程
# x_cols为自变量名称列表，lm.coef_为各自变量的回归系数，lm.intercept_为截距
print(f"Coefficient: {x_cols}")
print(f"Coefficient: {lm.coef_}")
print(f"Intercept {lm.intercept_:,.2f}")
print("Equation Price = ")
print(
    f"{lm.intercept_:,.2f} + "
    + " + ".join(
        [f"{x:,.2f} * {c}" for x, c in zip(lm.coef_, x_cols)]
    )
)

Coefficient: ['Avg. Area Number of Bedrooms', 'Avg. Area House Age', 'Area Population', 'Avg. Area Income', 'Avg. Area Number of Rooms']
Coefficient: [2.23380186e+03 1.64883282e+05 1.51504200e+01 2.15282755e+01
 1.22368678e+05]
Intercept -2,640,159.80
Equation Price = 
-2,640,159.80 + 2,233.80 * Avg. Area Number of Bedrooms + 164,883.28 * Avg. Area House Age + 15.15 * Area Population + 21.53 * Avg. Area Income + 122,368.68 * Avg. Area Number of Rooms

# 使用训练好的多变量线性回归模型 lm 对测试集 X_test 进行预测，
# 得到预测结果 y_prediction
# 计算并输出模型在测试集上的R^2拟合优度指标，评估模型性能
from sklearn.metrics import r2_score
y_prediction = lm.predict(X_test)
r2_score(y_test, y_prediction)

0.9176824009649222

# 使用测试集上的真实房价（y_test）和模型预测值（y_prediction）构建DataFrame
# 利用seaborn的lmplot绘制实际值与预测值的回归图，红色线表示拟合效果
model = pd.DataFrame()
model['Price'] = y_test
model['prediction'] = y_prediction
sns.lmplot(data=model, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc50777c650>

# 使用sklearn的Pipeline简化机器学习流程
# 只需定义步骤列表（如回归器），即可自动完成数据拟合和预测
import sklearn.pipeline
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.4, random_state=101)

steps = [('regressor', sk.linear_model.LinearRegression())]
pipeline = sklearn.pipeline.Pipeline(steps=steps)
fitted_model = pipeline.fit(X_train, y_train)
fitted_model

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Pipeline(steps=[('regressor', LinearRegression())])

Pipeline(steps=[('regressor', LinearRegression())])

LinearRegression()

# 从pipeline中检索已拟合的模型
# fitted_model.steps是一个包含所有步骤的列表，每个步骤是一个(name, estimator)元组
# lm_name为步骤名称，lm为线性回归模型对象
lm_name = fitted_model.steps[0][0]
lm = fitted_model.steps[0][1]
lm

LinearRegression()

LinearRegression()

# 输出模型的系数、截距和回归方程
# x_cols为自变量名称列表，lm.coef_为各自变量的回归系数，lm.intercept_为截距
print(f"Coefficient: {x_cols}")
print(f"Coefficient: {lm.coef_}")
print(f"Intercept: {lm.intercept_}")
print("Equation Price = ")
print(
    f"{lm.intercept_:,.2f} + "
    + " + ".join(
        [f"{x:,.2f} * {c}" for x, c in zip(lm.coef_, x_cols)]
    )
)

Coefficient: ['Avg. Area Number of Bedrooms', 'Avg. Area House Age', 'Area Population', 'Avg. Area Income', 'Avg. Area Number of Rooms']
Coefficient: [2.23380186e+03 1.64883282e+05 1.51504200e+01 2.15282755e+01
 1.22368678e+05]
Intercept: -2640159.7968529495
Equation Price = 
-2,640,159.80 + 2,233.80 * Avg. Area Number of Bedrooms + 164,883.28 * Avg. Area House Age + 15.15 * Area Population + 21.53 * Avg. Area Income + 122,368.68 * Avg. Area Number of Rooms

# 使用训练好的pipeline模型 fitted_model 对测试集 X_test 进行预测
# r2_score用于评估模型的拟合优度，计算公式为：
# R^2 = 1 - (∑(y_true - y_pred)^2) / (∑(y_true - y_mean)^2)
from sklearn.metrics import r2_score
y_prediction = fitted_model.predict(X_test)
r2_score(y_test, y_prediction)

0.9176824009649222

# 创建一个空的DataFrame用于存储测试数据和预测结果
model = pd.DataFrame()
model['Price'] = y_test
model['prediction'] = y_prediction
sns.lmplot(data=model, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4c95e8b30>

import sklearn.preprocessing
# 定义机器学习流水线的处理步骤
# 第一步：数据归一化处理，使用MinMaxScaler将特征缩放到[0,1]范围
# 第二步：使用线性回归模型进行预测
steps = [
    ('normalize', sk.preprocessing.MinMaxScaler()),
    ('regressor', sk.linear_model.LinearRegression())
]
pipeline = sklearn.pipeline.Pipeline(steps=steps)
fitted_model = pipeline.fit(X_train, y_train)
fitted_model

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Pipeline(steps=[('normalize', MinMaxScaler()),
                ('regressor', LinearRegression())])

Pipeline(steps=[('normalize', MinMaxScaler()),
                ('regressor', LinearRegression())])

MinMaxScaler()

LinearRegression()

# 从训练好的流水线模型中获取第二个步骤的名称
# fitted_model.steps是一个列表，包含了流水线中所有步骤
# 每个步骤是一个元组：(步骤名称, 步骤对象)
# [1]表示获取第二个步骤（索引从0开始），[0]表示获取该步骤的名称
lm_name = fitted_model.steps[1][0]
lm = fitted_model.steps[1][1]
lm

LinearRegression()

LinearRegression()

# model coefficient, intercept (same as above)
print(f"Coefficient: {x_cols}")
print(f"Coefficient: {lm.coef_}")
print(f"Intercept: {lm.intercept_}")
print("Equation Price = ")
print(
    f"{lm.intercept_:,.2f} + "
    + " + ".join(
        [f"{x:,.2f} * {c}" for x, c in zip(lm.coef_, x_cols)]
    )
)

Coefficient: ['Avg. Area Number of Bedrooms', 'Avg. Area House Age', 'Area Population', 'Avg. Area Income', 'Avg. Area Number of Rooms']
Coefficient: [  10052.10838858 1108257.87520464 1005022.2422017  1555354.04803071
  833252.82787088]
Intercept: -877972.9916972378
Equation Price = 
-877,972.99 + 10,052.11 * Avg. Area Number of Bedrooms + 1,108,257.88 * Avg. Area House Age + 1,005,022.24 * Area Population + 1,555,354.05 * Avg. Area Income + 833,252.83 * Avg. Area Number of Rooms

# apply the fitted model to the test data, with r2 fit score
from sklearn.metrics import r2_score

y_prediction = fitted_model.predict(X_test)
r2_score(y_test, y_prediction)

0.9176824009649182

# actual vs predicted values
model = pd.DataFrame()
model['Price'] = y_test
model['prediction'] = y_prediction
sns.lmplot(data=model, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4c94ece90>

# pyspark read data
df = spark.read.csv('/Volumes/workspace/usahousing/usahousing/USA_Housing.csv', inferSchema=True, header=True, mode='DROPMALFORMED')
display(df.limit(2))

# rename columns and cast type
for col in df.columns:
    df = df.withColumnRenamed(col, col.replace('.', ''))
for col in df.columns:
    df = df.withColumnRenamed(col, col.replace(' ', '_'))
df = df.withColumn('Avg_Area_Income', df.Avg_Area_Income.cast('double'))
df = df.withColumn('Avg_Area_House_Age', df.Avg_Area_House_Age.cast('double'))
df.printSchema()

root
 |-- Avg_Area_Income: double (nullable = true)
 |-- Avg_Area_House_Age: double (nullable = true)
 |-- Avg_Area_Number_of_Rooms: double (nullable = true)
 |-- Avg_Area_Number_of_Bedrooms: double (nullable = true)
 |-- Area_Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)

# Clean data
df = df.dropna()
USAhousing = df

from pyspark.ml.feature import VectorAssembler

# similar to this in python
# X = USAhousing[['Avg. Area Income']]
# y = USAhousing['Price']

# Use assembler to create a features column from the independent (X) variables
# VectorAssembler 用于将多个特征列合并为一个向量列，这是大多数 Spark ML 算法所要求的
assembled_data = VectorAssembler(inputCols=['Avg_Area_Income'], outputCol='features')
assembled_data = assembled_data.transform(USAhousing)
display(assembled_data.limit(2))

display(assembled_data.select('Avg_Area_Income', 'features').limit(2))

# scatter plot data
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_context("notebook")
sns.set_style('white')
sns.set_palette('bright')
pdf = USAhousing.toPandas()
plt.figure(figsize=(5, 5))
sns.scatterplot(data=pdf, x='Avg_Area_Income', y='Price').set(title='Avg. Area Income vs Price')

[Text(0.5, 1.0, 'Avg. Area Income vs Price')]

# regression
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='Price')
lm = lr.fit(assembled_data)
# model coefficient, intercept (same as above)
print(f"Coefficient: {lm.coefficients}")

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Coefficient: [21.195483171931965]

# Predictions
from pyspark.ml.linalg import Vectors
print(f"{lm.predict(Vectors.dense([10000])):,.2f}")
print(f"{lm.predict(Vectors.dense([20000])):,.2f}")
print(f"{lm.predict(Vectors.dense([30000])):,.2f}")
print(f"{lm.predict(Vectors.dense([40000])):,.2f}")

-9,624.65
202,330.19
414,285.02
626,239.85

lm_results = lm.transform(assembled_data)
display(lm_results.select('Price', 'prediction').limit(2))

# actual vs predicted values
pdf = lm_results.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4bd9f9550>

# model coeffients, intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
print(f"Equation: Price = {lm.coefficients[0]} * Avg_Area_Income + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")

Coefficient: [21.195483171931965]
Intercept: -221579.47820593425
Equation: Price = 21.195483171931965 * Avg_Area_Income + -221579.47820593425
R^2 fit metric 0.409

# fitted line over model
m = lm.coefficients[0]
b = lm.intercept
pdf['predicted_Y'] = m * pdf['Avg_Area_Income'] + b
sns.lmplot(data=pdf, x='Avg_Area_Income', y='Price', line_kws={"color": "red"}, height=5, aspect=1).set(title='Avg. Area Income vs Price')
sns.lineplot(data=pdf, x='Avg_Area_Income', y='predicted_Y', color='red')

<Axes: title={'center': 'Avg. Area Income vs Price'}, xlabel='Avg_Area_Income', ylabel='Price'>

# multiple variable regression
x_cols = list(set(USAhousing.columns) - {'Price', 'Address'})

assembler = VectorAssembler(inputCols=[*x_cols], outputCol='features')
assembled_data = assembler.transform(USAhousing)
display(assembled_data.limit(2))

# split data into training and test sets
train_data, test_data = assembled_data.randomSplit([0.6, 0.4], 24)

# fit the model to the training data
lr = LinearRegression(featuresCol='features', labelCol='Price')
lm = lr.fit(train_data)

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

# model coefficient, intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")

Coefficient: [3605.397252460005,21.689817372329568,119901.1195811148,15.300748157693516,167166.3131438628]
Intercept: -2658556.8356019533
Equation: Price = 3,605.40 * Avg_Area_Number_of_Bedrooms+21.69 * Avg_Area_Income+119,901.12 * Avg_Area_Number_of_Rooms+15.30 * Area_Population+167,166.31 * Avg_Area_House_Age + -2658556.8356019533
R^2 fit metric 0.919

# apply yje fitted model to the test data

lm_results = lm.transform(test_data)
labeld_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))

# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator

regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")
# multiple independent variable is better

R^2 on test data = 0.916

pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4c171e9c0>

# Machine learning pipeline
# regression using pipeline
from pyspark.ml import Pipeline
train_data, test_data = USAhousing.randomSplit([0.6, 0.4], 24)
pipeline = Pipeline(stages=[assembler, lr])
fitted_model = pipeline.fit(train_data)
fitted_model.stages

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

[VectorAssembler_58061aa5448a,
 LinearRegressionModel: uid=LinearRegression_3da97fde2f6a, numFeatures=5]

# retreive the fitted model from pipeline
lm = fitted_model.stages[1]
# model coefficientm intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")

Coefficient: [1998.606677232051,21.502388556887606,120617.74657879196,15.23501171427518,164683.75900259122]
Intercept: -2629038.655070738
Equation: Price = 1,998.61 * Avg_Area_Number_of_Bedrooms+21.50 * Avg_Area_Income+120,617.75 * Avg_Area_Number_of_Rooms+15.24 * Area_Population+164,683.76 * Avg_Area_House_Age + -2629038.655070738
R^2 fit metric 0.917

# aplly the fitted model to the test data
lm_results = fitted_model.transform(test_data)
labeld_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))

# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator

regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")

R^2 on test data = 0.920

# actual vs predicted values
pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4bdc69f40>

# add additional steps to ML pipeline
from pyspark.ml.feature import StandardScaler
# Run pipeline, fitting the model
train_data, test_data = USAhousing.randomSplit([0.6, 0.4], 24)
pipeline = Pipeline(stages=[VectorAssembler(inputCols=[*x_cols], outputCol='vector_features'),
                            StandardScaler(inputCol='vector_features', outputCol='features'),
                            LinearRegression(labelCol='Price')])
fitted_model = pipeline.fit(train_data)

Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

# Pipeline stages from model
fitted_model.stages

[VectorAssembler_a23ce974de75,
 StandardScalerModel: uid=StandardScaler_478fe328a565, numFeatures=5, withMean=false, withStd=true,
 LinearRegressionModel: uid=LinearRegression_29ff763dc53a, numFeatures=5]

# Retreive the fitted model from pipeline
lm = fitted_model.stages[2]
# model coefficientm intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")

Coefficient: [2470.3928711735416,228321.8648034716,121105.44836963713,151370.824287895,163367.2436056838]
Intercept: -2629038.655069568
Equation: Price = 2,470.39 * Avg_Area_Number_of_Bedrooms+228,321.86 * Avg_Area_Income+121,105.45 * Avg_Area_Number_of_Rooms+151,370.82 * Area_Population+163,367.24 * Avg_Area_House_Age + -2629038.655069568
R^2 fit metric 0.917

# Apply the fitted model to the test data
lm_results = fitted_model.transform(test_data)
labeled_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))

# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator

regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")

R^2 on test data = 0.920

# actual vs predicted values
pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4bdb783b0>

# RFormula 提供了一种简洁的方式来指定特征转换和建模公式
#simplifying with RFormula
from pyspark.ml.feature import RFormula, StandardScaler
# Run pipeline, fitting the model
train_data, test_data = USAhousing.randomSplit([0.6, 0.4], 24)
x_cols = list(set(USAhousing.columns) - {'Price', 'Address'})
formula = "{} ~ {}".format('Price', ' + '.join(x_cols))
print("Formula:{}".format(formula))

pipeline = Pipeline(stages=[RFormula(formula=formula),
                            StandardScaler(inputCol='features', outputCol='scaled_features'),
                            LinearRegression(labelCol='Price')])
fitted_model = pipeline.fit(train_data)

Formula:Price ~ Avg_Area_Number_of_Bedrooms + Avg_Area_Income + Avg_Area_Number_of_Rooms + Area_Population + Avg_Area_House_Age

Downloading artifacts:   0%|          | 0/65 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

# retreieve the fitted model from pipeline
lm = fitted_model.stages[2]
# model coefficientm intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")

Coefficient: [1998.606677232051,21.502388556887606,120617.74657879196,15.23501171427518,164683.75900259122]
Intercept: -2629038.655070738
Equation: Price = 1,998.61 * Avg_Area_Number_of_Bedrooms+21.50 * Avg_Area_Income+120,617.75 * Avg_Area_Number_of_Rooms+15.24 * Area_Population+164,683.76 * Avg_Area_House_Age + -2629038.655070738
R^2 fit metric 0.917

# Apply the fitted model to the test data
lm_results = fitted_model.transform(test_data)
labeled_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))

# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator

regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")

R^2 on test data = 0.920

pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')

<seaborn.axisgrid.FacetGrid at 0x7fc4c2baf1a0>

	Avg. Area Income	Avg. Area House Age	Avg. Area Number of Rooms	Avg. Area Number of Bedrooms	Area Population	Price	Address
0	79545.458574	5.682861	7.009188	4.09	23086.800503	1.059034e+06	208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1	79248.642455	6.002900	6.730821	3.09	40173.072174	1.505891e+06	188 Johnson Views Suite 079\nLake Kathleen, CA...
2	61287.067179	5.865890	8.512727	5.13	36882.159400	1.058988e+06	9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3	63345.240046	7.188236	5.586729	3.26	34310.242831	1.260617e+06	USS Barnett\nFPO AP 44820
4	59982.197226	5.040555	7.839388	4.23	26354.109472	6.309435e+05	USNS Raymond\nFPO AE 09386

	0
0	1.464425e+06
1	1.458134e+06
2	1.077430e+06
3	1.121053e+06
4	1.049772e+06
...	...
4995	1.062187e+06
4996	1.442081e+06
4997	1.122017e+06
4998	1.219742e+06
4999	1.166949e+06

	Avg. Area Number of Bedrooms	Avg. Area House Age	Area Population	Avg. Area Income	Avg. Area Number of Rooms
1303	3.10	5.364208	44557.379656	68091.179676	7.502956
1051	4.21	5.580599	29996.018448	75729.765546	7.642973
4904	5.42	6.358747	38627.301473	70885.420819	7.250241
931	4.30	4.966360	38413.490484	73386.407340	7.915453
4976	5.23	5.351169	34107.888619	75046.313791	7.797825

Avg. Area Income	Avg. Area House Age	Avg. Area Number of Rooms	Avg. Area Number of Bedrooms	Area Population	Price	Address
79545.45857431678	5.682861321615587	7.009188142792237	4.09	23086.800502686456	1059033.5578701235	208 Michael Ferry Apt. 674
79248.64245482568	6.0028998082752425	6.730821019094919	3.09	40173.07217364482	1505890.91484695	188 Johnson Views Suite 079

Avg_Area_Income	Avg_Area_House_Age	Avg_Area_Number_of_Rooms	Avg_Area_Number_of_Bedrooms	Area_Population	Price	Address	features
79545.45857431678	5.682861321615587	7.009188142792237	4.09	23086.800502686456	1059033.5578701235	208 Michael Ferry Apt. 674	Map(vectorType -> dense, length -> 1, values -> List(79545.45857431678))
79248.64245482568	6.0028998082752425	6.730821019094919	3.09	40173.07217364482	1505890.91484695	188 Johnson Views Suite 079	Map(vectorType -> dense, length -> 1, values -> List(79248.64245482568))

大数据分析与挖掘¶

03. Linear regression in Python and Pyspark ¶

Simple one variable regression¶

multiple variable regression¶

Machine Learning Pippeline¶

使用pipeline简化复杂数据处理¶

Pyspark regression¶

Simple one variable regression¶

multiple variable regression¶

pipeline¶

Complex data processing simplified with pipeline¶

Price	prediction
1059033.5578701235	1464424.9504096084
1505890.91484695	1458133.7893437766

Price	prediction
31140.517620186045	99409.9178372696
723750.0652577134	572142.5458968766

Price	prediction
1077805.577726322	905856.8198512727
299863.0401311839	382459.18199658114

Price	prediction
1077805.577726322	905856.8198512159
299863.0401311839	382459.1819968051