Use Jupyter Spark to Extract Phenotypic Data from the UKB Database
Jupyter, UKB
import databricks.koalas as ks
import dxpy
import dxdata
import pandas as pd
import pyspark
import re

# - Initialize Spark
# Initialize Spark
# getOrCreate() returns the SparkContext already running in this kernel (or
# creates one if none exists), so re-running this cell no longer raises
# ValueError. Restart the kernel (Kernel -> Restart kernel) only when a fresh
# context is actually needed.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# - Automatically discover dispensed dataset ID and load the dataset
# Find the dispensed dataset: search folder "/" for the single data object of
# type "Dataset" whose name matches the glob "app*.dataset", then keep its ID
# so the dataset can be loaded.
dispensed_dataset = dxpy.find_one_data_object(
    name="app*.dataset",
    name_mode="glob",
    typename="Dataset",
    folder="/",
)
dispensed_dataset_id = dispensed_dataset["id"]
# Load the dataset found above and take its 'participant' entity, which is
# what the field lookups and the data retrieval below operate on.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset['participant']

# UKB field IDs to extract (as strings; they are interpolated into field names).
field_ids = ['31', '21022', '41270']
# Optional: also request individual array entries of field 41280.
# for i in range(0,259):
#     field_ids.append('41280_a'+str(i))
print(field_ids)

# - grab all field names
# This function is used to grab all field names (e.g. "p<field_id>_iYYY_aZZZ") of a list of field IDs
def fields_for_id(field_id):
    """Return all participant fields of one field ID in natural order.

    Matches field names of the form ``p<field_id>``, optionally followed by
    an instance suffix (``_i<N>``) and/or an array suffix (``_a<N>``).

    Args:
        field_id: Field ID as a string or integer, e.g. ``'41270'`` or ``31``.

    Returns:
        A list of the matching field objects, sorted by name so that numeric
        parts compare as numbers (``_a2`` comes before ``_a10``).
    """
    field_id = str(field_id)
    fields = participant.find_fields(
        name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))

    def natural_key(field):
        # Splitting on a capturing (\d+) alternates text and digit runs, so
        # every odd position holds a number; compare those as ints.
        parts = re.split(r'(\d+)', field.name)
        return [int(part) if pos % 2 else part
                for pos, part in enumerate(parts)]

    # Replaces distutils.version.LooseVersion: distutils is deprecated
    # (PEP 632) and removed in Python 3.12. This key gives the same ordering
    # for the field names matched above.
    return sorted(fields, key=natural_key)


#field_ids = ['31', '22001', '22006', '22019', '22021', '21022']
# Fields to retrieve: the participant ID ('eid'), 'p20160_i0', and for every
# entry in field_ids the first field (in sorted order) that fields_for_id finds.
fields = [
    participant.find_field(name='eid'),
    participant.find_field(name='p20160_i0'),
    *[fields_for_id(f)[0] for f in field_ids],
]

# Summary table of the selected fields: name, title and, where a field has a
# coding, its codes (empty string otherwise).
field_description = pd.DataFrame({
    'Field': [f.name for f in fields],
    'Title': [f.title for f in fields],
    'Coding': [f.coding.codes if f.coding is not None else '' for f in fields],
})
field_description

# - load cohort created by UKB RAP
# Load the saved cohort; its SQL is used below to restrict the rows retrieved.
samples = dxdata.load_cohort("/Cohort/all_participants")

# - retrieve data for fields
# Pull the selected fields for the cohort's participants and expose the
# result as a Koalas DataFrame.
samples_df = participant.retrieve_fields(
    fields=fields,
    filter_sql=samples.sql,
    engine=dxdata.connect(),
).to_koalas()

# - export data
type(samples_df)
# Materialise the Koalas frame as an in-memory pandas DataFrame.
df_phenotype = samples_df.to_pandas()
df_phenotype.shape
df_phenotype.head()
# Write from the pandas copy, not the Koalas frame: Koalas' to_csv(path)
# produces a directory of part files and has no `index`/`quoting` parameters,
# whereas pandas writes the single local tab-separated file that the upload
# step expects. quoting=3 is csv.QUOTE_NONE.
df_phenotype.to_csv('all_samples.txt', sep='\t', na_rep='NA', index=False, quoting=3)

# - upload (save) codes and results
%%bash -s "/phenotype_data/"
# $1 is the destination folder passed through -s. The file name must match
# the one written by the export step (all_samples.txt).
dx upload all_samples.txt -p --path "$1" --brief

%%bash -s "/code/"
# Save the notebook itself next to the results; $1 is the destination folder.
dx upload big_allsamples.ipynb -p --path "$1" --brief