data_lg = data.withColumn("log_price", F.log(data["price"]))
pdf_lg = data_lg.toPandas()

# 读取 CSV
df1 = spark.read.csv("/Volumes/dataset/winemag/winemag-data/winemag-data-130k-v2.csv", header=True, inferSchema=True, multiLine=True, mode="DROPMALFORMED")
df2 = spark.read.csv("/Volumes/dataset/winemag/winemag-data/winemag-data_first150k.csv", header=True, inferSchema=True, multiLine=True, mode="DROPMALFORMED")
print(f"spark.read.csv返回的数据类型: {type(df1)}")
print(f"df1的数据类型: {type(df1)}")
import pandas as pd
pdf1 = df1.toPandas()
print(f"pdf1的数据类型: {type(pdf1)}")
print(f"df1的列：{df1.columns}")
print(f"df2的列：{df2.columns}")
extra_columns = list(set(df1.columns) - set(df2.columns))
print(f"These columns will be dropped: {extra_columns}")
display(df1.limit(2))

spark.read.csv返回的数据类型: <class 'pyspark.sql.connect.dataframe.DataFrame'>
df1的数据类型: <class 'pyspark.sql.connect.dataframe.DataFrame'>
pdf1的数据类型: <class 'pandas.core.frame.DataFrame'>
df1的列：['_c0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery']
df2的列：['_c0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'variety', 'winery']
These columns will be dropped: ['taster_name', 'taster_twitter_handle', 'title']

# Combined two datasets
df1 = df1.drop(*extra_columns) # drop extra columns in df1
print(df1.columns)
data = df1.union(df2).drop('_c0').distinct()
print(data.columns)
display(data.limit(2))
# data 是合并后的葡萄酒评论数据集，包含以下字段：
# country: 产国
# description: 品酒师对葡萄酒的描述
# designation: 葡萄酒的指定名称
# points: 品酒师评分
# price: 葡萄酒价格
# province: 产省
# region_1: 产区1
# region_2: 产区2
# variety: 葡萄品种
# winery: 酒庄名称

['_c0', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'variety', 'winery']
['country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'variety', 'winery']

# Some minor data cleaning
# define string and number UDFs
from typing import Optional
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd

# 装饰器用于简化函数功能的扩展，例如在 PySpark 中注册 UDF（用户自定义函数）时，
# 可以通过 @F.udf 装饰器直接将普通 Python 函数转换为 Spark 可用的 UDF，
# 这样可以方便地在 DataFrame 操作中调用，提高代码的可读性和复用性。
@F.udf(T.BooleanType()) # UDF to check if a value is a number
def is_a_number(value: Optional[str]) -> bool:
    if not value:
        return True
    try:
        _ = float(value)
    except ValueError:
        return False
    return True

@F.udf(T.BooleanType())
def is_a_string(value: Optional[str]) -> bool:
    if not value:
        return True
    if isinstance(value, str):
        return True
    return False

CONTINUOUS_COLUMNS = ['points', 'price', ]
CATEGORICAL_COLUMNS = list(set(data.columns) - set(CONTINUOUS_COLUMNS))
print(CONTINUOUS_COLUMNS)
print(CATEGORICAL_COLUMNS)

['points', 'price']
['region_1', 'country', 'designation', 'variety', 'winery', 'province', 'region_2', 'description']

# Spark DataFrame 的 where 方法用于根据条件筛选出满足条件的行，等价于 filter 方法。
# 例如，data.where(~is_a_number('points')) 表示筛选出 'points' 列不是数字的行。
# 语法：data.where(条件表达式)
# 返回：只包含满足条件的行的 DataFrame
display(data.where(~is_a_number('points')))
# display 函数用于在 Databricks notebook 中以可视化表格的形式展示 DataFrame 或其他对象，方便数据浏览和分析。
display(data.where(is_a_number('points')).limit(2))

# DataFrame.count() 用于统计 Spark DataFrame 的总行数，是常用的行动操作之一。
# 例如，print(data.count()) 会输出 data 数据集的行数。
# 该方法会触发 Spark 的实际计算，将所有分布式分区的数据聚合，返回一个整数结果。
print(data.count())
bad_rows = data.where(~is_a_number('points') | ~is_a_number('price')).count()
print(bad_rows)
print(f"Dropping {bad_rows} with numeric values in non-numeric columns")
data = data.where(is_a_string('province') & is_a_string('region_1') & is_a_string('region_2') & is_a_string('variety'))

170491
0
Dropping 0 with numeric values in non-numeric columns

# 打印转换前的连续型数据列的数据类型
print("Before casting:")
for col in CONTINUOUS_COLUMNS:
    print(f"{col}: {data.schema[col].dataType}")
# 将连续型数据列转换为 double 类型，确保后续分析和计算的数值精度
for col in CONTINUOUS_COLUMNS:
    data = data.withColumn(col, F.col(col).cast('double'))

Before casting:
points: StringType()
price: StringType()

# Drop rows with null values in continuous columns
# *CONTINUOUS_COLUMNS 的含义是将列表 CONTINUOUS_COLUMNS 中的元素作为单独的参数传递
# 例如，如果 CONTINUOUS_COLUMNS = ['points', 'price']，则 *CONTINUOUS_COLUMNS 等价于 'points', 'price'
data = data.na.drop(subset=[*CONTINUOUS_COLUMNS])
# Drop rows with null values in other columns
data = data.dropna(subset=['variety', 'country'])
display(data.limit(2))

# Basic summary
# data summary
display(data.summary())

import pyspark.sql.types as T
import pyspark.sql.functions as F
import pandas as pd

# 使用 pandas_udf 定义一个 Spark UDF，将评分 points 列分为等级 grade
# 评分区间划分：A: >=95, B: >=85, C: >=75, D: 其他
@F.pandas_udf(T.StringType())
def points_category(points: pd.Series) -> pd.Series:
    def category(points):
        if points >= 95:
            return 'A'
        elif points >= 85:
            return 'B'
        elif points >= 75:
            return 'C'
        else:
            return 'D'
    return points.apply(category)

data = data.withColumn('grade', points_category(data['points'])).orderBy('grade', 'country', 'variety')

my_columns = list(set(data.columns) - {'description'})
data = data.select(*my_columns)
display(data.limit(2))

# 采样大数据集并转换为 Pandas DataFrame，便于后续绘图和分析
print(f"Original rows: {data.count()}")
percent = 25/100
sample = data.sample(percent, 3)
print(f"Sampled {percent:.2f}% rows: {sample.count():,}")

Original rows: 157598
Sampled 0.25% rows: 39,156

# Full dataset to pandas
pdf = data.toPandas()

import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# databricks (sampled) histogram of price
print(data.count())
display(data.select('price').limit(3))

157598

# Price distribution plot
sns.displot(data=pdf, x='price', kde=True, bins=25, height=5, aspect=2).set(title='Prices')

<seaborn.axisgrid.FacetGrid at 0xff6a43b46960>

# 为什么使用 log(price)？
# 价格分布通常高度偏斜（右偏），大部分葡萄酒价格集中在较低区间，少数价格极高。
# 对价格取对数（log(price)）可以：
# 1. 缓解极端值影响，使分布更接近正态，有利于可视化和统计分析。
# 2. 便于发现价格变化的相对影响（如价格翻倍），而非绝对变化。
# 3. 在回归等建模场景下，提升模型拟合效果和解释性。
import pyspark.sql.functions as F
# Spark SQL 函数如 F.log('price')、F.col('price') 等，参数可以直接写列名字符串（如 'price'），
# 因为这些函数会在 DataFrame 的上下文中查找对应的列，无需指定 DataFrame 名称。
# 例如，data.select(F.log('price')) 会在 data 这个 DataFrame 中查找 'price' 列并对其取对数。
# 这与 Pandas 的 df['price'] 不同，Pandas 需显式指定 DataFrame。
# Spark 的 select、withColumn、filter 等方法都支持这种写法，便于链式操作和表达式组合。
display(data.select(F.log('price')).limit(3))

# 在 Spark DataFrame data 中新增一列 'lg_price'，其值为 'price' 列取自然对数后的结果。
# F.log(data['price']) 表示对 'price' 列应用 Spark SQL 的 log 函数（自然对数），
# withColumn 用于添加新列或替换已有列，结果返回新的 DataFrame data_lg。
data_lg = data.withColumn('lg_price', F.log('price'))
pdf_lg = data_lg.toPandas()
sns.displot(data=pdf_lg, x='lg_price', kde=True, bins=25, height=5, aspect=2).set(title='Log Prices')

<seaborn.axisgrid.FacetGrid at 0xff69f68c6f60>

# Sample of Spark log price data plotted from Pandas (for big data)
data_lg = data.withColumn('lg_price', F.log(data['price']))
percent = 40/100
# .sample(fraction, seed) 是 Spark DataFrame 的采样方法
# fraction: 采样比例（0~1之间），如 0.4 表示采样 40% 的数据
# seed: 随机种子，保证采样结果可复现
# 返回一个新的 DataFrame，包含随机采样的部分数据
pdf_lg = data_lg.sample(percent, 3).toPandas()
sns.displot(data=pdf_lg, x='lg_price', kde=True, bins=25, height=5, aspect=2).set(title='Log Prices (sampled)')

<seaborn.axisgrid.FacetGrid at 0xff69f0d22960>

# More distribution plots
# Price<200 distribution plot (with kde)
pdf_200 = pdf[pdf['price']<200]
sns.displot(data=pdf_200, x='price', kde=True, bins=25, height=5, aspect=2).set(title='Prices < 200')

<seaborn.axisgrid.FacetGrid at 0xff69f1b32810>

# .alias() 用于为 DataFrame 列或聚合结果指定新的列名，便于后续引用和结果展示。
country_pdf = data.groupby('country').agg(F.count('country').alias('count'), 
                                                  F.avg('price').alias('price'),
                                                  F.avg('points').alias('points'))\
                                                    .orderBy('count', ascending=False).where("count > 1000").toPandas()
countries = country_pdf['country'].head(10).tolist()
# 只保留在 countries 列表中的国家，并绘制分布图
plot_data = pdf[pdf['country'].isin(countries) & (pdf['price']<200)]
sns.displot(data=plot_data, x='price', bins=25, hue='country', height=4, aspect=2).set(title='Prices < 200 by Country')

<seaborn.axisgrid.FacetGrid at 0xff69f13751f0>

# 按品种和国家分组的价格分布图（Facet Grid）
# countries: 选取评论数最多的前3个国家
countries = country_pdf['country'].head(3).tolist()
# varieties: 选取评论数最多的前3个葡萄品种
variety_lists = data.groupby('variety').agg(F.count('variety').alias('count'))\
    .orderBy('count', ascending=False).select('variety').limit(3).toPandas().values.tolist()
varieties = [item for sublist in variety_lists for item in sublist]
# 只保留价格小于200的样本，按品种和国家分面，按等级着色
sns.displot(data=pdf[(pdf['country'].isin(countries)) & (pdf['variety'].isin(varieties)) & (pdf['price']<200)], 
            x='price', bins=25, hue='grade', row='variety', col='country', height=3, aspect=1.2)

<seaborn.axisgrid.FacetGrid at 0xff69f0f8e480>

# points 分布图
# 使用 Seaborn 绘制评分（points）分布直方图，带核密度估计（kde）
# 评分区间通常用于评估葡萄酒品质，分布图有助于观察评分集中区间和极端值
sns.displot(data=pdf, x='points', kde=True, bins=25, height=5, aspect=2).set(title='Points')

<seaborn.axisgrid.FacetGrid at 0xff69f0a0b020>

# 价格与评分的分布图（二维直方图）
sns.displot(data=pdf, x='price', y='points', height=5, aspect=2).set(title='Price vs Points')

<seaborn.axisgrid.FacetGrid at 0xff69f1b13410>

sns.displot(data=pdf[pdf['price']<200], x='price', y='points', kind='kde', height=5, aspect=2).set(title='Price vs Points with KDE')

<seaborn.axisgrid.FacetGrid at 0xff69ecf762d0>

# Scatter plots

sns.scatterplot(data=pdf, x='price', y='points')

<Axes: xlabel='price', ylabel='points'>

# relplot 是 Seaborn 的高级接口，可以绘制散点图（kind='scatter'）和折线图（kind='line'），支持 Facet Grid（分面），适合多变量分组可视化。
# scatterplot 是底层函数，只绘制单一散点图，参数更直接，适合简单的二维数据可视化。
# relplot 默认 kind='scatter'，但可以通过参数切换为折线图；scatterplot 仅用于散点图。
sns.relplot(data=pdf, x='price', y='points', kind='scatter')

<seaborn.axisgrid.FacetGrid at 0xff69ebb19c40>

# sns.pairplot 用于可视化多个连续变量之间的两两关系。
# 对于每一对变量，绘制散点图；对角线绘制每个变量的分布直方图。
# 便于发现变量间的相关性、分布特征和异常值。
sns.pairplot(pdf)

<seaborn.axisgrid.PairGrid at 0xff69e9fd4da0>

# 相关性热力图（Correlation Heatmap）
# 用于展示各数值型变量之间的相关性，便于发现变量间的线性关系
plt.figure(figsize=(7, 7))
# smaller (correlation) numbers are darker
sns.heatmap(pdf.corr(), annot=True)

/home/spark-e79ba7c1-7b16-4f29-9209-0b/.ipykernel/2757/command-4624715461691492-1142457085:6: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(pdf.corr(), annot=True)

<Axes: >

# 透视热力图：展示不同国家和品种的平均评分（points）
# 行为葡萄品种（variety），列为国家（country），单元格为平均评分
# 便于比较各国各品种的评分高低，发现高分集中区
# 只选取评论数最多的前10个国家和品种，且价格小于200
# sns.heatmap 用于绘制热力图，annot=True 显示数值
plt.figure(figsize=(7,7))
countires = country_pdf['country'].head(10).tolist()
# Top n varieties (by reviews)
variesty_lists = data.groupBy('variety').agg(F.count('variety').alias('count'))\
    .orderBy('count', ascending=False).select('variety').limit(10).toPandas().values.tolist()
varieties = [item for sublist in variesty_lists for item in sublist]
pdfp = pdf[(pdf['country'].isin(countries)) & (pdf['variety'].isin(varieties)) & (pdf['price'] < 200)]
points = pdfp.pivot_table(index='variety', columns='country', values='points')
display(points)

<Figure size 700x700 with 0 Axes>

sns.heatmap(points, annot=True)

<Axes: xlabel='country', ylabel='variety'>

# clustermap for top varieties (price<200)
# 1. 获取评论数最多的前15个葡萄品种
variesty_lists = data.groupBy('variety').agg(F.count('variety').alias('count'))\
    .orderBy('count', ascending=False).select('variety').limit(15).toPandas().values.tolist()
varieties = [item for sublist in variesty_lists for item in sublist]
# 2. 按品种聚合，计算平均价格和平均评分
vdata = data.groupBy('variety').agg(F.avg('price').alias('price'), F.avg('points').alias('points')).toPandas()
# 3. 只保留前15个品种且平均价格小于200的数据
pdfp = vdata[vdata['variety'].isin(varieties) & (vdata['price']<200)]
# 4. 构建聚类数据，设置品种为索引
cluster = pdfp[['variety', 'price', 'points']]
cluster = cluster.set_index('variety', inplace=False)
# 5. 绘制聚类热力图，展示品种间的价格和评分聚类关系
sns.clustermap(cluster, figsize=(10,10), annot=True)
# light color = low value; dark color = high value

<seaborn.matrix.ClusterGrid at 0xff69e7d39520>

# 绘制价格与评分的回归线图，红色线表示拟合的回归关系（points ~ price）
sns.regplot(data=pdf, x='price', y='points', line_kws={'color':'red'}).set(title='Price vs Points with Regression Line (points ~ price)')

[Text(0.5, 1.0, 'Price vs Points with Regression Line (points ~ price)')]

# 回归分析：价格 ~ 评分，按葡萄品种分色（hue），每个品种用不同 marker
# lmplot 支持 Facet Grid，可用于多变量分组回归可视化
countries = country_pdf['country'].head(5).tolist() 

# 选取评论数最多的前 n 个葡萄品种
n = 3
varieties = data.groupBy('variety').agg(F.count('variety').alias('count'))\
    .orderBy('count', ascending=False).select('variety').limit(n).toPandas()['variety'].values.tolist()

# 构建较小的数据集，筛选指定国家、品种，且价格小于 200
pdf_s = pdf[(pdf['country'].isin(countries)) & (pdf['variety'].isin(varieties)) & (pdf['price']<200)].head(200)

markers = ['.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', '8', 's', 'p', '*', 'h', 'H', '+', 'x', 'D', 'd', '|', '_', 'P', 'X']

sns.set_context('paper', font_scale=1.4)

# 绘制分组回归图，每个品种用不同 marker，点大小和边框可调
sns.lmplot(
    data=pdf_s,
    x='price',
    y='points',
    hue='variety',
    markers=markers[:n],
    scatter_kws={'s':100, 'linewidths':0.5, 'edgecolor': 'w'},
    height=7,
    aspect=1.2
)

<seaborn.axisgrid.FacetGrid at 0xff69e82b6bd0>

#facet
# 按国家和品种分面，绘制价格与评分的回归图，每个子图对应一个国家和品种组合
sns.lmplot(
    data=pdf_s,
    x='price',
    y='points',
    col='variety',
    row='country',
    scatter_kws={
        's': 100,
        'linewidths': 0.5,
        'edgecolor': 'w'
    },
    height=7,
    aspect=1.2
)

<seaborn.axisgrid.FacetGrid at 0xfff450445ac0>

# 按国家分组，统计评论数、平均价格和平均评分
# 只保留评论数大于1000的国家，按评论数降序排列
country_pdf = data.groupby('country').agg(F.count('country').alias('count'), \
                                                    F.avg('price').alias('price'), \
                                                    F.avg('points').alias('points'))\
                                        .orderBy('count', ascending=False).where("count > 1000").toPandas()

sns.catplot(data=country_pdf, x='country', y='count', kind='bar', height=5, aspect=2).set(title='Reviews per Country (count > 1000)')

<seaborn.axisgrid.FacetGrid at 0xff69e44c6f00>

# 与上面类似，但为水平条形图，仅保留评论数大于2000的品种
variety_pdf = data.groupby('variety').agg(F.count('variety').alias('count'), \
                                                    F.avg('price').alias('price'), \
                                                    F.avg('points').alias('points'))\
                                        .orderBy('count', ascending=False).where("count > 2000").toPandas()

sns.catplot(data=variety_pdf, x='count', y='variety', kind='bar', height=5, aspect=2, orient='h').set(title='Reviews per Variety (count > 2000)')

<seaborn.axisgrid.FacetGrid at 0xff69e824c200>

# 按等级分组，绘制价格小提琴图，展示不同等级的价格分布情况
sns.catplot(data=pdf, x='grade', y='price', kind='violin', height=5, aspect=2).set(title='Price')

<seaborn.axisgrid.FacetGrid at 0xfff468aa8080>

# 按等级分组，绘制价格小提琴图，仅展示价格小于200的分布情况
sns.catplot(data=pdf[pdf['price']<200], x='grade', y='price', kind='violin', height=5, aspect=2).set(title='Price < 200')

<seaborn.axisgrid.FacetGrid at 0xff69e8b1fef0>

# 按对数价格分布绘制小提琴图，展示价格的对数变换后分布情况
data_lg = data.withColumn('lg_price', F.log(data['price']))
pdf_lg = data_lg.toPandas()
sns.catplot(data=pdf_lg, x='lg_price', kind='violin', height=5, aspect=2).set(title='Log Price')

<seaborn.axisgrid.FacetGrid at 0xff69f1079e20>

# Price per (top n) country violin plot (vertical)

# 获取评论数最多的前10个国家
countries = country_pdf['country'].head(10).tolist()
# 获取评论数最多的前3个葡萄品种
variety_lists = data.groupBy('variety').agg(F.count('variety').alias('count'))\
    .orderBy('count', ascending=False).select('variety').limit(3).toPandas().values.tolist()

# 按国家分组，筛选指定国家和品种，且价格小于200，绘制价格分布小提琴图
sns.catplot(
    data=pdf[pdf['country'].isin(countries) & pdf['variety'].isin(varieties) & (pdf['price']<200)],
    x='country',
    y='price',
    kind='violin',
    height=5,
    aspect=2
)

<seaborn.axisgrid.FacetGrid at 0xff69de7bff20>

# 按评论数最多的前10个国家和前4个葡萄品种筛选，且价格小于200
# 绘制分组箱线图，x轴为价格，y轴为国家，按品种分色，展示各国各品种的价格分布

countries = country_pdf['country'].head(10).tolist()

# 获取评论数最多的前10个国家
varieties = (
    data.groupBy('variety')
    .agg(F.count('variety').alias('count'))
    .orderBy('count', ascending=False)
    .select('variety')
    .limit(4)
    .toPandas()['variety']
    .tolist()
)

# 按评论数最多的前10个国家和前4个葡萄品种筛选，且价格小于200
# countries 为评论数最多的前10个国家列表
filtered_pdf = pdf[
    pdf['country'].isin(countries) &
    pdf['variety'].isin(varieties) &
    (pdf['price'] < 200)
]

sns.catplot(
    data=filtered_pdf,
    x='price',
    y='country',
    hue='variety',
    kind='box',
    height=5,
    aspect=2
)

<seaborn.axisgrid.FacetGrid at 0xff69de4c0110>

# 按评论数最多的前10个国家和前3个葡萄品种筛选，且价格小于200
# 绘制分面箱线图，x轴为国家，y轴为价格，按品种分面，展示各国各品种的价格分布
# 便于比较不同国家和品种的价格分布特征
# Top n countries (by reviews)
countries = country_pdf['country'].head(10).tolist()

# Top n varieties (by reviews)
varieties = (
    data.groupBy('variety')
    .agg(F.count('variety').alias('count'))
    .orderBy('count', ascending=False)
    .select('variety')
    .limit(3)
    .toPandas()['variety']
    .tolist()
)

filtered_pdf = pdf[
    pdf['country'].isin(countries) &
    pdf['variety'].isin(varieties) &
    (pdf['price'] < 200)
]

sns.catplot(
    data=filtered_pdf,
    x='country',
    y='price',
    col='variety',
    kind='box',
    height=5,
    aspect=2
)

<seaborn.axisgrid.FacetGrid at 0xff69e74dd730>

特点	Spark DataFrame	Pandas DataFrame
处理规模	分布式，适合大数据（TB级）	单机，适合中小数据（GB级）
内存管理	自动分布在集群节点，内存高效	依赖本地内存，易受限
API风格	类SQL，惰性执行（lazy evaluation）	类Python，立即执行（eager）
数据类型支持	支持复杂类型（Array, Map, Struct等）	主要支持基础类型
计算性能	可并行处理，适合批量和流式数据	单线程处理，适合交互式分析
容错性	高，自动恢复失败任务	低，需手动处理异常
生态集成	与大数据生态（HDFS, Hive, Delta等）紧密	主要与Python数据科学库集成

功能	Spark DataFrame 示例	Pandas DataFrame 示例
选取列	`df.select("col1", "col2")`	`df[["col1", "col2"]]`
过滤	`df.filter(df.col > 0)`	`df[df["col"] > 0]`
分组聚合	`df.groupBy("col").agg(...)`	`df.groupby("col").agg(...)`
缺失值处理	`df.dropna()`	`df.dropna()`
新增列	`df.withColumn("new", ...)`	`df["new"] = ...`
排序	`df.orderBy("col")`	`df.sort_values("col")`
合并	`df.join(df2, ...)`	`pd.merge(df, df2, ...)`
转换为Pandas	`df.toPandas()`	-

_c0	country	description	designation	points	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
0	Italy	Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.	Vulkà Bianco	87	null	Sicily & Sardinia	Etna	null	Kerin O’Keefe	@kerinokeefe	Nicosia 2013 Vulkà Bianco (Etna)	White Blend	Nicosia
1	Portugal	This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.	Avidagos	87	15.0	Douro	null	null	Roger Voss	@vossroger	Quinta dos Avidagos 2011 Avidagos Red (Douro)	Portuguese Red	Quinta dos Avidagos

country	description	designation	points	price	province	region_1	region_2	variety	winery
Italy	Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.	Vulkà Bianco	87	null	Sicily & Sardinia	Etna	null	White Blend	Nicosia
Portugal	This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.	Avidagos	87	15.0	Douro	null	null	Portuguese Red	Quinta dos Avidagos

country	description	designation	points	price	province	region_1	region_2	variety	winery
US	Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.	null	87.0	14.0	Oregon	Willamette Valley	Willamette Valley	Pinot Gris	Rainstorm
Portugal	This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's already drinkable, although it will certainly be better from 2016.	Avidagos	87.0	15.0	Douro	null	null	Portuguese Red	Quinta dos Avidagos

大数据分析与挖掘¶

02. how to do exploratory data analysis in PySpark ¶

使用 Spark 数据绘图¶

Seaborn plotting¶

流程¶

下载和读取数据集¶

Spark DataFrame 与 Pandas DataFrame 的差异¶

主要函数差异举例¶

dataset to pandas¶

Common plots using Seaborn¶

考虑数据存储位置¶

推荐做法¶

relational plots¶

Categorical plots¶

country	description	designation	points	price	province	region_1	region_2	variety	winery
France	Fruity and ripe, the wine also has some solid structure to give it a core of dark tannins. The black-currant fruits are developing well, bringing their crispness to the party. Drink this wine from 2018.	null	89	14.0	Bordeaux	Bordeaux Supérieur	null	Bordeaux-style Red Blend	Château Bois Chantant
Austria	This is a crisp, but also rich style of Federspiel. It is full of ripe pear and apple flavor as well as a fine pinch of pepper. Acidity creeps in the background to balance this ready-to-drink wine. Screwcap.	Loibner Klostersatz Federspiel	90	null	Wachau	null	null	Grüner Veltliner	F X Pichler

summary	country	description	designation	points	price	province	region_1	region_2	variety	winery
count	157598	157598	110728	157598	157598	157598	131982	67301	157598	157598
mean	null	null	1438.9025128205128	88.19423469840987	34.66431680605084	null	null	null	null	Infinity
stddev	null	null	6668.827060821776	3.15080267784541	39.8498680318638	null	null	null	null	NaN
min	Albania	"An oddball of an eiswein. Bizarre earthy, brothy notes combine with honeyed sweetness to make for an ""interesting"" tasting experience."	"""M"""	80.0	4.0	Achaia	Abruzzo	California Other	Abouriou	1+1=3
25%	null	null	120.0	86.0	16.0	null	null	null	null	1850.0
50%	null	null	446.0	88.0	25.0	null	null	null	null	1850.0
75%	null	null	1769.0	90.0	40.0	null	null	null	null	1919.0
max	Uruguay	“Wow” is the first word that comes to mind when tasting this superconcentrated and immensely fruity wine. It practically overwhelms the nose and the palate with ripe, jammy boysenberry and blackberry, sprinkled with licorice and black-pepper nuances that add attractive complexity. It's a big wine all the way, but not extreme in alcohol.	“Champ” Lightnin' Lane	100.0	3300.0	Župa	Zonda Valley	Willamette Valley	Žilavka	Štoka

region_1	grade	country	designation	variety	winery	province	points	region_2	price
Tupungato	A	Argentina	Gran Corte Las Divas Vineyard	Bordeaux-style Red Blend	Riglos	Mendoza Province	96.0	null	50.0
Mendoza	A	Argentina	Vineyard Selection	Bordeaux-style Red Blend	Finca Perdriel	Mendoza Province	95.0	null	62.0

France	Italy	US
88.02736248236954	88.85714285714286	89.46805678793257
84.91304347826087	89.40506329113924	88.77355438738374
89.2088888888889	88.24716553287982	88.20004495392223
85.8273381294964	88.7816091954023	87.02699597932222
89.2566271855612	86.66666666666667	89.37784911717496
87.72897196261682	88.80836886993603	87.68922108575924
90.1875	87.59375	87.82698795180723
88.13577799801784	86.81538461538462	87.2164054684895
89.21036585365853	88.9119496855346	89.00535117056856
null	null	87.31621271076524

ln(price)
3.912023005428146
4.127134385045092
4.304065093204169