Use Jupyter Spark to Extract Phenotypic Data from the UKB Database

Jupyter, UKB

Published

October 27, 2023

import databricks.koalas as ks
import dxpy
import dxdata
import pandas as pd
import pyspark
import re
# Initialize Spark.
# SparkContext.getOrCreate() hands back the already-running context when one
# exists, so re-running this cell does not fail with "Cannot run multiple
# SparkContexts at once". Restart the kernel only when a fresh context is needed.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)
# Locate the dataset dispensed to this project: a "Dataset" object in the
# project root folder whose name matches the glob "app*.dataset".
dispensed_dataset = dxpy.find_one_data_object(
    name="app*.dataset",
    name_mode="glob",
    typename="Dataset",
    folder="/",
)
dispensed_dataset_id = dispensed_dataset["id"]

# Load the dataset and keep a handle on its 'participant' entity, which is
# what the field lookups below are run against.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

# Field IDs to look up on the participant entity.
field_ids = ["31", "21022", "41270"]
# Disabled: would additionally request '41280_a0' ... '41280_a258'.
# for i in range(0,259):
#     field_ids.append('41280_a'+str(i))
print(field_ids)
def fields_for_id(field_id):
    """Return all participant fields that belong to one field ID.

    Matches field names of the form ``p<field_id>``, optionally followed by
    an ``_i<digits>`` and/or ``_a<digits>`` suffix (e.g. "p<field_id>_iYYY_aZZZ").

    Args:
        field_id: Field ID as a string or anything ``str()`` can convert.
            It is interpolated into a regular expression unescaped.

    Returns:
        The matching field objects sorted by name in natural order, so
        numeric parts compare as integers (``_a2`` sorts before ``_a10``).
    """
    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))

    def natural_key(field):
        # re.split with a capturing group alternates text / digit-run tokens:
        # even positions are text, odd positions are digits. Converting only
        # the odd positions keeps every key position type-consistent, so the
        # comparison can never mix int and str.
        # This replaces distutils.version.LooseVersion (removed in Python 3.12).
        tokens = re.split(r'(\d+)', field.name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    return sorted(fields, key=natural_key)
# Alternative field selection kept for reference:
#field_ids = ['31', '22001', '22006', '22019', '22021', '21022']

# Columns to extract: the participant ID ('eid'), 'p20160_i0', and for each
# requested field ID only the FIRST matching field in sorted order.
fields = [
    participant.find_field(name='eid'),
    participant.find_field(name='p20160_i0'),
    *(fields_for_id(field_id)[0] for field_id in field_ids),
]

# Summary table of the selected fields; 'Coding' is left as an empty string
# for fields that have no coding attached.
field_description = pd.DataFrame({
    'Field': [field.name for field in fields],
    'Title': [field.title for field in fields],
    'Coding': ['' if field.coding is None else field.coding.codes for field in fields],
})
field_description
samples = dxdata.load_cohort("/Cohort/all_participants") 
samples_df = participant.retrieve_fields(fields = fields, filter_sql = samples.sql, engine=dxdata.connect()).to_koalas()
type(samples_df)
df_phenotype = samples_df.to_pandas()
df_phenotype.shape
df_phenotype.head()
samples_df.to_csv('all_samples.txt', sep='\t', na_rep='NA', index=False, quoting=3)
%%bash -s "/phenotype_data/"
dx upload all_samples_.txt -p --path $1 --brief
%%bash -s "/code/"
# Upload big_allsamples.ipynb (presumably this notebook -- confirm the file name)
# to the folder passed as $1 ("/code/" from the cell magic above).
dx upload big_allsamples.ipynb -p --path $1 --brief

Source