In [0]:
%sh
#wget https://raw.githubusercontent.com/bcbarsness/machine-learning/master/USA_Housing.csv
# USA_Housing.csv数据集包含美国各地区的住房相关信息,包括以下字段:
# - 'Avg. Area Income': 区域平均收入
# - 'Avg. Area House Age': 区域房屋平均年龄
# - 'Avg. Area Number of Rooms': 区域平均房间数
# - 'Avg. Area Number of Bedrooms': 区域平均卧室数
# - 'Area Population': 区域人口
# - 'Price': 房价(目标变量)
# - 'Address': 房屋地址(通常不参与建模)
# 该数据集常用于线性回归等机器学习任务,分析影响房价的因素。
In [0]:
# 导入所需的库
# pandas用于数据处理,numpy用于数值计算,matplotlib和seaborn用于数据可视化
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# read data
USAhousing = pd.read_csv('USA_Housing.csv')
In [0]:
# 设置pandas显示选项:最多显示20行
pd.set_option('display.max_rows', 20)
# 设置pandas显示选项:最多显示20列
pd.set_option('display.max_columns', 20)
# 设置pandas显示选项:每行最多显示300字符宽度
pd.set_option('display.width', 300)
# 显示USAhousing数据集的前5行
USAhousing.head(5)
Out[0]:
| Avg. Area Income | Avg. Area House Age | Avg. Area Number of Rooms | Avg. Area Number of Bedrooms | Area Population | Price | Address | |
|---|---|---|---|---|---|---|---|
| 0 | 79545.458574 | 5.682861 | 7.009188 | 4.09 | 23086.800503 | 1.059034e+06 | 208 Michael Ferry Apt. 674\nLaurabury, NE 3701... |
| 1 | 79248.642455 | 6.002900 | 6.730821 | 3.09 | 40173.072174 | 1.505891e+06 | 188 Johnson Views Suite 079\nLake Kathleen, CA... |
| 2 | 61287.067179 | 5.865890 | 8.512727 | 5.13 | 36882.159400 | 1.058988e+06 | 9127 Elizabeth Stravenue\nDanieltown, WI 06482... |
| 3 | 63345.240046 | 7.188236 | 5.586729 | 3.26 | 34310.242831 | 1.260617e+06 | USS Barnett\nFPO AP 44820 |
| 4 | 59982.197226 | 5.040555 | 7.839388 | 4.23 | 26354.109472 | 6.309435e+05 | USNS Raymond\nFPO AE 09386 |
Simple one variable regression¶
- using sklearn
In [0]:
# 选择用于回归分析的自变量和因变量
# x为“Avg. Area Income”(区域平均收入),y为“Price”(房价)
# 使用seaborn绘制自变量与因变量的散点图,观察两者之间的关系
x = USAhousing[['Avg. Area Income']]
y = USAhousing['Price']
sns.scatterplot(data=USAhousing, x='Avg. Area Income', y='Price')
Out[0]:
<Axes: xlabel='Avg. Area Income', ylabel='Price'>
In [0]:
# 导入sklearn库
# 创建线性回归模型并拟合数据
# 输出模型的系数和截距
import sklearn as sk
lm = sk.linear_model.LinearRegression()
lm.fit(x,y)
print(lm.coef_[0], lm.intercept_)
Uploading artifacts: 0%| | 0/9 [00:00<?, ?it/s]
21.19548317193168 -221579.4782059181
In [0]:
# 绘制拟合直线
# m为回归系数(斜率),b为截距
# plt.scatter绘制原始数据点,
# plt.plot绘制回归拟合线
plt.xlabel('Avg. Area Income')
plt.ylabel('Price')
plt.scatter(x,y)
# Y = m * x + b
m = lm.coef_[0]
b = lm.intercept_
plt.plot(x, m*x + b, color='red')
Out[0]:
[<matplotlib.lines.Line2D at 0x7fc507b81c10>]
In [0]:
#using seaborn
pdf = USAhousing
sns.regplot(x='Avg. Area Income', y='Price', data=pdf, line_kws={"color": "red"})
Out[0]:
<Axes: xlabel='Avg. Area Income', ylabel='Price'>
In [0]:
# predictions
print(f"{lm.predict([[18000]])[0]:.2f}")
print(f"{lm.predict([[40000]])[0]:.2f}")
print(f"{lm.predict([[80000]])[0]:.2f}")
print(f"{lm.predict([[100000]])[0]:.2f}")
/databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( /databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
159939.22 626239.85 1474059.18
/databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn( /databricks/python/lib/python3.12/site-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names warnings.warn(
1897968.84
In [0]:
# 使用训练好的线性回归模型 lm 对自变量 x 进行预测,
# 得到预测结果 lm_results
# 将预测结果转换为 DataFrame 以便查看
lm_results = lm.predict(x)
pd.DataFrame(data=lm_results)
Out[0]:
| 0 | |
|---|---|
| 0 | 1.464425e+06 |
| 1 | 1.458134e+06 |
| 2 | 1.077430e+06 |
| 3 | 1.121053e+06 |
| 4 | 1.049772e+06 |
| ... | ... |
| 4995 | 1.062187e+06 |
| 4996 | 1.442081e+06 |
| 4997 | 1.122017e+06 |
| 4998 | 1.219742e+06 |
| 4999 | 1.166949e+06 |
5000 rows × 1 columns
In [0]:
# 计算线性回归模型的R^2拟合优度指标
# r2_score用于评估模型对数据的拟合程度,值越接近1表示拟合效果越好
from sklearn.metrics import r2_score
predictions = lm.predict(x)
r2_score(y, predictions)
Out[0]:
0.4092593070338846
In [0]:
# 输出线性回归模型的系数、截距、回归方程和R^2拟合优度指标
print(f"Coefficient: {lm.coef_[0]:.2f}")
print(f"Intercept: {lm.intercept_:.2f}")
print(f"Equation Price = {lm.coef_[0]:.2f} * Avg. Area Income + {lm.intercept_:.2f}")
print(f"R^2 fit metric {r2_score(y, predictions):.2f}")
Coefficient: 21.20 Intercept: -221579.48 Equation Price = 21.20 * Avg. Area Income + -221579.48 R^2 fit metric 0.41
multiple variable regression¶
In [0]:
# 多变量线性回归
# x_cols为自变量列名列表(去除'Price'和'Address')
# X为所有自变量组成的数据框,y为因变量(房价)
x_cols = list(set(USAhousing.columns) - {'Price', 'Address'})
X = USAhousing[[*x_cols]]
y = USAhousing['Price']
In [0]:
# 将数据集划分为训练集和测试集,测试集占40%,随机种子为101
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.4, random_state=101)
X_train.head()
Out[0]:
| Avg. Area Number of Bedrooms | Avg. Area House Age | Area Population | Avg. Area Income | Avg. Area Number of Rooms | |
|---|---|---|---|---|---|
| 1303 | 3.10 | 5.364208 | 44557.379656 | 68091.179676 | 7.502956 |
| 1051 | 4.21 | 5.580599 | 29996.018448 | 75729.765546 | 7.642973 |
| 4904 | 5.42 | 6.358747 | 38627.301473 | 70885.420819 | 7.250241 |
| 931 | 4.30 | 4.966360 | 38413.490484 | 73386.407340 | 7.915453 |
| 4976 | 5.23 | 5.351169 | 34107.888619 | 75046.313791 | 7.797825 |
In [0]:
# 使用训练集数据拟合多变量线性回归模型
lm = sk.linear_model.LinearRegression()
lm.fit(X_train, y_train)
Uploading artifacts: 0%| | 0/9 [00:00<?, ?it/s]
Out[0]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [0]:
# 输出多变量线性回归模型的系数、截距、回归方程
# x_cols为自变量名称列表,lm.coef_为各自变量的回归系数,lm.intercept_为截距
print(f"Coefficient: {x_cols}")
print(f"Coefficient: {lm.coef_}")
print(f"Intercept {lm.intercept_:,.2f}")
print("Equation Price = ")
print(
f"{lm.intercept_:,.2f} + "
+ " + ".join(
[f"{x:,.2f} * {c}" for x, c in zip(lm.coef_, x_cols)]
)
)
Coefficient: ['Avg. Area Number of Bedrooms', 'Avg. Area House Age', 'Area Population', 'Avg. Area Income', 'Avg. Area Number of Rooms'] Coefficient: [2.23380186e+03 1.64883282e+05 1.51504200e+01 2.15282755e+01 1.22368678e+05] Intercept -2,640,159.80 Equation Price = -2,640,159.80 + 2,233.80 * Avg. Area Number of Bedrooms + 164,883.28 * Avg. Area House Age + 15.15 * Area Population + 21.53 * Avg. Area Income + 122,368.68 * Avg. Area Number of Rooms
In [0]:
# 使用训练好的多变量线性回归模型 lm 对测试集 X_test 进行预测,
# 得到预测结果 y_prediction
# 计算并输出模型在测试集上的R^2拟合优度指标,评估模型性能
from sklearn.metrics import r2_score
y_prediction = lm.predict(X_test)
r2_score(y_test, y_prediction)
Out[0]:
0.9176824009649222
In [0]:
# 使用测试集上的真实房价(y_test)和模型预测值(y_prediction)构建DataFrame
# 利用seaborn的lmplot绘制实际值与预测值的回归图,红色线表示拟合效果
model = pd.DataFrame()
model['Price'] = y_test
model['prediction'] = y_prediction
sns.lmplot(data=model, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc50777c650>
Machine Learning Pippeline¶
- regression using pipeline
In [0]:
# 使用sklearn的Pipeline简化机器学习流程
# 只需定义步骤列表(如回归器),即可自动完成数据拟合和预测
import sklearn.pipeline
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.4, random_state=101)
steps = [('regressor', sk.linear_model.LinearRegression())]
pipeline = sklearn.pipeline.Pipeline(steps=steps)
fitted_model = pipeline.fit(X_train, y_train)
fitted_model
Uploading artifacts: 0%| | 0/9 [00:00<?, ?it/s]
Out[0]:
Pipeline(steps=[('regressor', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('regressor', LinearRegression())])LinearRegression()
In [0]:
# 从pipeline中检索已拟合的模型
# fitted_model.steps是一个包含所有步骤的列表,每个步骤是一个(name, estimator)元组
# lm_name为步骤名称,lm为线性回归模型对象
lm_name = fitted_model.steps[0][0]
lm = fitted_model.steps[0][1]
lm
Out[0]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [0]:
# 输出模型的系数、截距和回归方程
# x_cols为自变量名称列表,lm.coef_为各自变量的回归系数,lm.intercept_为截距
print(f"Coefficient: {x_cols}")
print(f"Coefficient: {lm.coef_}")
print(f"Intercept: {lm.intercept_}")
print("Equation Price = ")
print(
f"{lm.intercept_:,.2f} + "
+ " + ".join(
[f"{x:,.2f} * {c}" for x, c in zip(lm.coef_, x_cols)]
)
)
Coefficient: ['Avg. Area Number of Bedrooms', 'Avg. Area House Age', 'Area Population', 'Avg. Area Income', 'Avg. Area Number of Rooms'] Coefficient: [2.23380186e+03 1.64883282e+05 1.51504200e+01 2.15282755e+01 1.22368678e+05] Intercept: -2640159.7968529495 Equation Price = -2,640,159.80 + 2,233.80 * Avg. Area Number of Bedrooms + 164,883.28 * Avg. Area House Age + 15.15 * Area Population + 21.53 * Avg. Area Income + 122,368.68 * Avg. Area Number of Rooms
In [0]:
# 使用训练好的pipeline模型 fitted_model 对测试集 X_test 进行预测
# r2_score用于评估模型的拟合优度,计算公式为:
# R^2 = 1 - (∑(y_true - y_pred)^2) / (∑(y_true - y_mean)^2)
from sklearn.metrics import r2_score
y_prediction = fitted_model.predict(X_test)
r2_score(y_test, y_prediction)
Out[0]:
0.9176824009649222
In [0]:
# 创建一个空的DataFrame用于存储测试数据和预测结果
model = pd.DataFrame()
model['Price'] = y_test
model['prediction'] = y_prediction
sns.lmplot(data=model, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4c95e8b30>
In [0]:
import sklearn.preprocessing
# 定义机器学习流水线的处理步骤
# 第一步:数据归一化处理,使用MinMaxScaler将特征缩放到[0,1]范围
# 第二步:使用线性回归模型进行预测
steps = [
('normalize', sk.preprocessing.MinMaxScaler()),
('regressor', sk.linear_model.LinearRegression())
]
pipeline = sklearn.pipeline.Pipeline(steps=steps)
fitted_model = pipeline.fit(X_train, y_train)
fitted_model
Uploading artifacts: 0%| | 0/9 [00:00<?, ?it/s]
Out[0]:
Pipeline(steps=[('normalize', MinMaxScaler()),
('regressor', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('normalize', MinMaxScaler()),
('regressor', LinearRegression())])MinMaxScaler()
LinearRegression()
In [0]:
# 从训练好的流水线模型中获取第二个步骤的名称
# fitted_model.steps是一个列表,包含了流水线中所有步骤
# 每个步骤是一个元组:(步骤名称, 步骤对象)
# [1]表示获取第二个步骤(索引从0开始),[0]表示获取该步骤的名称
lm_name = fitted_model.steps[1][0]
lm = fitted_model.steps[1][1]
lm
Out[0]:
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [0]:
# model coefficient, intercept (same as above)
print(f"Coefficient: {x_cols}")
print(f"Coefficient: {lm.coef_}")
print(f"Intercept: {lm.intercept_}")
print("Equation Price = ")
print(
f"{lm.intercept_:,.2f} + "
+ " + ".join(
[f"{x:,.2f} * {c}" for x, c in zip(lm.coef_, x_cols)]
)
)
Coefficient: ['Avg. Area Number of Bedrooms', 'Avg. Area House Age', 'Area Population', 'Avg. Area Income', 'Avg. Area Number of Rooms'] Coefficient: [ 10052.10838858 1108257.87520464 1005022.2422017 1555354.04803071 833252.82787088] Intercept: -877972.9916972378 Equation Price = -877,972.99 + 10,052.11 * Avg. Area Number of Bedrooms + 1,108,257.88 * Avg. Area House Age + 1,005,022.24 * Area Population + 1,555,354.05 * Avg. Area Income + 833,252.83 * Avg. Area Number of Rooms
In [0]:
# apply the fitted model to the test data, with r2 fit score
from sklearn.metrics import r2_score
y_prediction = fitted_model.predict(X_test)
r2_score(y_test, y_prediction)
Out[0]:
0.9176824009649182
In [0]:
# actual vs predicted values
model = pd.DataFrame()
model['Price'] = y_test
model['prediction'] = y_prediction
sns.lmplot(data=model, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4c94ece90>
使用pipeline简化复杂数据处理¶
- 由于上面的示例只有两个步骤,因此并未展示pipeline的优势
- 例如,自然语言处理所需的多个步骤可以通过pipeline得到简化
Pyspark regression¶
- one independent variable
- multiple independent variables
- simple pipeline
In [0]:
# pyspark read data
df = spark.read.csv('/Volumes/workspace/usahousing/usahousing/USA_Housing.csv', inferSchema=True, header=True, mode='DROPMALFORMED')
display(df.limit(2))
| Avg. Area Income | Avg. Area House Age | Avg. Area Number of Rooms | Avg. Area Number of Bedrooms | Area Population | Price | Address |
|---|---|---|---|---|---|---|
| 79545.45857431678 | 5.682861321615587 | 7.009188142792237 | 4.09 | 23086.800502686456 | 1059033.5578701235 | 208 Michael Ferry Apt. 674 |
| 79248.64245482568 | 6.0028998082752425 | 6.730821019094919 | 3.09 | 40173.07217364482 | 1505890.91484695 | 188 Johnson Views Suite 079 |
In [0]:
# rename columns and cast type
for col in df.columns:
df = df.withColumnRenamed(col, col.replace('.', ''))
for col in df.columns:
df = df.withColumnRenamed(col, col.replace(' ', '_'))
df = df.withColumn('Avg_Area_Income', df.Avg_Area_Income.cast('double'))
df = df.withColumn('Avg_Area_House_Age', df.Avg_Area_House_Age.cast('double'))
df.printSchema()
root |-- Avg_Area_Income: double (nullable = true) |-- Avg_Area_House_Age: double (nullable = true) |-- Avg_Area_Number_of_Rooms: double (nullable = true) |-- Avg_Area_Number_of_Bedrooms: double (nullable = true) |-- Area_Population: double (nullable = true) |-- Price: double (nullable = true) |-- Address: string (nullable = true)
In [0]:
# Clean data
df = df.dropna()
USAhousing = df
Simple one variable regression¶
- using pyspark MLlib
- default predictor column is labels
- feature columns are placed into one vectorized column called features
In [0]:
from pyspark.ml.feature import VectorAssembler
# similar to this in python
# X = USAhousing[['Avg. Area Income']]
# y = USAhousing['Price']
# Use assembler to create a features column from the independent (X) variables
# VectorAssembler 用于将多个特征列合并为一个向量列,这是大多数 Spark ML 算法所要求的
assembled_data = VectorAssembler(inputCols=['Avg_Area_Income'], outputCol='features')
assembled_data = assembled_data.transform(USAhousing)
display(assembled_data.limit(2))
| Avg_Area_Income | Avg_Area_House_Age | Avg_Area_Number_of_Rooms | Avg_Area_Number_of_Bedrooms | Area_Population | Price | Address | features |
|---|---|---|---|---|---|---|---|
| 79545.45857431678 | 5.682861321615587 | 7.009188142792237 | 4.09 | 23086.800502686456 | 1059033.5578701235 | 208 Michael Ferry Apt. 674 | Map(vectorType -> dense, length -> 1, values -> List(79545.45857431678)) |
| 79248.64245482568 | 6.0028998082752425 | 6.730821019094919 | 3.09 | 40173.07217364482 | 1505890.91484695 | 188 Johnson Views Suite 079 | Map(vectorType -> dense, length -> 1, values -> List(79248.64245482568)) |
In [0]:
display(assembled_data.select('Avg_Area_Income', 'features').limit(2))
| Avg_Area_Income | features |
|---|---|
| 79545.45857431678 | Map(vectorType -> dense, length -> 1, values -> List(79545.45857431678)) |
| 79248.64245482568 | Map(vectorType -> dense, length -> 1, values -> List(79248.64245482568)) |
In [0]:
# scatter plot data
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_context("notebook")
sns.set_style('white')
sns.set_palette('bright')
pdf = USAhousing.toPandas()
plt.figure(figsize=(5, 5))
sns.scatterplot(data=pdf, x='Avg_Area_Income', y='Price').set(title='Avg. Area Income vs Price')
Out[0]:
[Text(0.5, 1.0, 'Avg. Area Income vs Price')]
In [0]:
# regression
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features', labelCol='Price')
lm = lr.fit(assembled_data)
# model coefficient, intercept (same as above)
print(f"Coefficient: {lm.coefficients}")
Downloading artifacts: 0%| | 0/15 [00:00<?, ?it/s]
Uploading artifacts: 0%| | 0/4 [00:00<?, ?it/s]
Coefficient: [21.195483171931965]
In [0]:
# Predictions
from pyspark.ml.linalg import Vectors
print(f"{lm.predict(Vectors.dense([10000])):,.2f}")
print(f"{lm.predict(Vectors.dense([20000])):,.2f}")
print(f"{lm.predict(Vectors.dense([30000])):,.2f}")
print(f"{lm.predict(Vectors.dense([40000])):,.2f}")
-9,624.65 202,330.19 414,285.02 626,239.85
In [0]:
lm_results = lm.transform(assembled_data)
display(lm_results.select('Price', 'prediction').limit(2))
| Price | prediction |
|---|---|
| 1059033.5578701235 | 1464424.9504096084 |
| 1505890.91484695 | 1458133.7893437766 |
In [0]:
# actual vs predicted values
pdf = lm_results.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4bd9f9550>
In [0]:
# model coeffients, intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
print(f"Equation: Price = {lm.coefficients[0]} * Avg_Area_Income + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")
Coefficient: [21.195483171931965] Intercept: -221579.47820593425 Equation: Price = 21.195483171931965 * Avg_Area_Income + -221579.47820593425 R^2 fit metric 0.409
In [0]:
# fitted line over model
m = lm.coefficients[0]
b = lm.intercept
pdf['predicted_Y'] = m * pdf['Avg_Area_Income'] + b
sns.lmplot(data=pdf, x='Avg_Area_Income', y='Price', line_kws={"color": "red"}, height=5, aspect=1).set(title='Avg. Area Income vs Price')
sns.lineplot(data=pdf, x='Avg_Area_Income', y='predicted_Y', color='red')
Out[0]:
<Axes: title={'center': 'Avg. Area Income vs Price'}, xlabel='Avg_Area_Income', ylabel='Price'>
multiple variable regression¶
In [0]:
# multiple variable regression
x_cols = list(set(USAhousing.columns) - {'Price', 'Address'})
assembler = VectorAssembler(inputCols=[*x_cols], outputCol='features')
assembled_data = assembler.transform(USAhousing)
display(assembled_data.limit(2))
| Avg_Area_Income | Avg_Area_House_Age | Avg_Area_Number_of_Rooms | Avg_Area_Number_of_Bedrooms | Area_Population | Price | Address | features |
|---|---|---|---|---|---|---|---|
| 79545.45857431678 | 5.682861321615587 | 7.009188142792237 | 4.09 | 23086.800502686456 | 1059033.5578701235 | 208 Michael Ferry Apt. 674 | Map(vectorType -> dense, length -> 5, values -> List(4.09, 79545.45857431678, 7.009188142792237, 23086.800502686456, 5.682861321615587)) |
| 79248.64245482568 | 6.0028998082752425 | 6.730821019094919 | 3.09 | 40173.07217364482 | 1505890.91484695 | 188 Johnson Views Suite 079 | Map(vectorType -> dense, length -> 5, values -> List(3.09, 79248.64245482568, 6.730821019094919, 40173.07217364482, 6.0028998082752425)) |
In [0]:
# split data into training and test sets
train_data, test_data = assembled_data.randomSplit([0.6, 0.4], 24)
# fit the model to the training data
lr = LinearRegression(featuresCol='features', labelCol='Price')
lm = lr.fit(train_data)
Downloading artifacts: 0%| | 0/15 [00:00<?, ?it/s]
Uploading artifacts: 0%| | 0/4 [00:00<?, ?it/s]
In [0]:
# model coefficient, intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")
Coefficient: [3605.397252460005,21.689817372329568,119901.1195811148,15.300748157693516,167166.3131438628] Intercept: -2658556.8356019533 Equation: Price = 3,605.40 * Avg_Area_Number_of_Bedrooms+21.69 * Avg_Area_Income+119,901.12 * Avg_Area_Number_of_Rooms+15.30 * Area_Population+167,166.31 * Avg_Area_House_Age + -2658556.8356019533 R^2 fit metric 0.919
In [0]:
# apply yje fitted model to the test data
lm_results = lm.transform(test_data)
labeld_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))
| Price | prediction |
|---|---|
| 31140.517620186045 | 99409.9178372696 |
| 723750.0652577134 | 572142.5458968766 |
In [0]:
# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator
regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")
# multiple independent variable is better
R^2 on test data = 0.916
In [0]:
pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4c171e9c0>
pipeline¶
In [0]:
# Machine learning pipeline
# regression using pipeline
from pyspark.ml import Pipeline
train_data, test_data = USAhousing.randomSplit([0.6, 0.4], 24)
pipeline = Pipeline(stages=[assembler, lr])
fitted_model = pipeline.fit(train_data)
fitted_model.stages
Downloading artifacts: 0%| | 0/20 [00:00<?, ?it/s]
Uploading artifacts: 0%| | 0/4 [00:00<?, ?it/s]
Out[0]:
[VectorAssembler_58061aa5448a, LinearRegressionModel: uid=LinearRegression_3da97fde2f6a, numFeatures=5]
In [0]:
# retreive the fitted model from pipeline
lm = fitted_model.stages[1]
# model coefficientm intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")
Coefficient: [1998.606677232051,21.502388556887606,120617.74657879196,15.23501171427518,164683.75900259122] Intercept: -2629038.655070738 Equation: Price = 1,998.61 * Avg_Area_Number_of_Bedrooms+21.50 * Avg_Area_Income+120,617.75 * Avg_Area_Number_of_Rooms+15.24 * Area_Population+164,683.76 * Avg_Area_House_Age + -2629038.655070738 R^2 fit metric 0.917
In [0]:
# aplly the fitted model to the test data
lm_results = fitted_model.transform(test_data)
labeld_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))
| Price | prediction |
|---|---|
| 1077805.577726322 | 905856.8198512727 |
| 299863.0401311839 | 382459.18199658114 |
In [0]:
# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator
regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")
R^2 on test data = 0.920
In [0]:
# actual vs predicted values
pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4bdc69f40>
In [0]:
# add additional steps to ML pipeline
from pyspark.ml.feature import StandardScaler
# Run pipeline, fitting the model
train_data, test_data = USAhousing.randomSplit([0.6, 0.4], 24)
pipeline = Pipeline(stages=[VectorAssembler(inputCols=[*x_cols], outputCol='vector_features'),
StandardScaler(inputCol='vector_features', outputCol='features'),
LinearRegression(labelCol='Price')])
fitted_model = pipeline.fit(train_data)
Downloading artifacts: 0%| | 0/30 [00:00<?, ?it/s]
Uploading artifacts: 0%| | 0/4 [00:00<?, ?it/s]
In [0]:
# Pipeline stages from model
fitted_model.stages
Out[0]:
[VectorAssembler_a23ce974de75, StandardScalerModel: uid=StandardScaler_478fe328a565, numFeatures=5, withMean=false, withStd=true, LinearRegressionModel: uid=LinearRegression_29ff763dc53a, numFeatures=5]
In [0]:
# Retreive the fitted model from pipeline
lm = fitted_model.stages[2]
# model coefficientm intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")
Coefficient: [2470.3928711735416,228321.8648034716,121105.44836963713,151370.824287895,163367.2436056838] Intercept: -2629038.655069568 Equation: Price = 2,470.39 * Avg_Area_Number_of_Bedrooms+228,321.86 * Avg_Area_Income+121,105.45 * Avg_Area_Number_of_Rooms+151,370.82 * Area_Population+163,367.24 * Avg_Area_House_Age + -2629038.655069568 R^2 fit metric 0.917
In [0]:
# Apply the fitted model to the test data
lm_results = fitted_model.transform(test_data)
labeled_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))
| Price | prediction |
|---|---|
| 1077805.577726322 | 905856.8198512159 |
| 299863.0401311839 | 382459.1819968051 |
In [0]:
# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator
regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")
R^2 on test data = 0.920
In [0]:
# actual vs predicted values
pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4bdb783b0>
In [0]:
# RFormula 提供了一种简洁的方式来指定特征转换和建模公式
#simplifying with RFormula
from pyspark.ml.feature import RFormula, StandardScaler
# Run pipeline, fitting the model
train_data, test_data = USAhousing.randomSplit([0.6, 0.4], 24)
x_cols = list(set(USAhousing.columns) - {'Price', 'Address'})
formula = "{} ~ {}".format('Price', ' + '.join(x_cols))
print("Formula:{}".format(formula))
pipeline = Pipeline(stages=[RFormula(formula=formula),
StandardScaler(inputCol='features', outputCol='scaled_features'),
LinearRegression(labelCol='Price')])
fitted_model = pipeline.fit(train_data)
Formula:Price ~ Avg_Area_Number_of_Bedrooms + Avg_Area_Income + Avg_Area_Number_of_Rooms + Area_Population + Avg_Area_House_Age
Downloading artifacts: 0%| | 0/65 [00:00<?, ?it/s]
Uploading artifacts: 0%| | 0/4 [00:00<?, ?it/s]
In [0]:
# retreieve the fitted model from pipeline
lm = fitted_model.stages[2]
# model coefficientm intercept
print(f"Coefficient: {lm.coefficients}")
print(f"Intercept: {lm.intercept}")
equation = '+'.join([f"{p[0]:,.2f} * {p[1]}" for p in zip(lm.coefficients, x_cols)])
print(f"Equation: Price = {equation} + {lm.intercept}")
print(f"R^2 fit metric {lm.summary.r2:.3f}")
Coefficient: [1998.606677232051,21.502388556887606,120617.74657879196,15.23501171427518,164683.75900259122] Intercept: -2629038.655070738 Equation: Price = 1,998.61 * Avg_Area_Number_of_Bedrooms+21.50 * Avg_Area_Income+120,617.75 * Avg_Area_Number_of_Rooms+15.24 * Area_Population+164,683.76 * Avg_Area_House_Age + -2629038.655070738 R^2 fit metric 0.917
In [0]:
# Apply the fitted model to the test data
lm_results = fitted_model.transform(test_data)
labeled_predictions = lm_results.select('Price', 'prediction')
display(lm_results.select('Price', 'prediction').limit(2))
| Price | prediction |
|---|---|
| 1077805.577726322 | 905856.8198512727 |
| 299863.0401311839 | 382459.18199658114 |
In [0]:
# evaluate model on test data
from pyspark.ml.evaluation import RegressionEvaluator
regression_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='Price', metricName='r2')
print(f"R^2 on test data = {regression_evaluator.evaluate(lm_results):.3f}")
R^2 on test data = 0.920
In [0]:
pdf = labeld_predictions.toPandas()
sns.lmplot(data=pdf, x='Price', y='prediction', line_kws={"color": "red"}, height=5, aspect=1).set(title='actual vs predicted')
Out[0]:
<seaborn.axisgrid.FacetGrid at 0x7fc4c2baf1a0>
Complex data processing simplified with pipeline¶
- with only two steps, the above example doesn't demonstrate the benefit of the pipeline
- For example, the many steps required by natural language processing are simplified with a pipeline