Use Jupyter Spark to Extract Phenotypic Data from the UKB Database
Jupyter, UKB
import databricks.koalas as ks
import dxpy
import dxdata
import pandas as pd
import pyspark
import re
- Initialize Spark
# Initialize Spark.
# Run this cell only once per kernel session; do not rerun it unless the
# kernel has been restarted (Kernel -> Restart kernel).
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
- Automatically discover dispensed dataset ID and load the dataset
# Automatically discover the dispensed dataset ID and load the dataset.
# The lookup glob-matches object names against "app*.dataset" in the
# project's root folder ("/") and restricts results to type "Dataset".
dispensed_dataset = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob")
dispensed_dataset_id = dispensed_dataset["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

# Entity of the dataset from which fields are looked up and retrieved below.
participant = dataset['participant']

# Field IDs to extract, given as strings.
field_ids = ['31', '21022', '41270']
# Optional extension kept from the original notebook (disabled):
# for i in range(0,259):
#     field_ids.append('41280_a'+str(i))
print(field_ids)
- grab all field names
# This function is used to grab all field names (e.g. "p<field_id>_iYYY_aZZZ")
# that belong to a single field ID.
def fields_for_id(field_id):
    """Return every `participant` field belonging to *field_id*, sorted by name.

    A field matches when its name is ``p<field_id>`` optionally followed by
    an ``_i<digits>`` suffix and/or an ``_a<digits>`` suffix.

    Args:
        field_id: Field ID; converted with ``str()`` before being placed in
            the name regex.

    Returns:
        The matching fields as a list, ordered by ``LooseVersion(f.name)`` so
        that numeric parts of the name compare as numbers rather than text.
    """
    # NOTE: distutils was removed from the standard library in Python 3.12;
    # this import requires an older interpreter (or a setuptools shim).
    from distutils.version import LooseVersion

    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))
    return sorted(fields, key=lambda f: LooseVersion(f.name))
# Alternative field selection kept from the original notebook (disabled):
#field_ids = ['31', '22001', '22006', '22019', '22021', '21022']

# Fields to retrieve: the 'eid' field, 'p20160_i0', and, for each requested
# field ID, only the first field returned by fields_for_id().
fields = [participant.find_field(name='eid')] + [participant.find_field(name='p20160_i0')] + [fields_for_id(f)[0] for f in field_ids]

# Summary table with one row per selected field: its name, its title, and
# its coding's codes (empty string when the field has no coding).
field_description = pd.DataFrame({
    'Field': [f.name for f in fields],
    'Title': [f.title for f in fields],
    'Coding': [f.coding.codes if f.coding is not None else '' for f in fields ]
})
field_description
- load cohort created by UKB RAP
# Load the cohort object stored at /Cohort/all_participants; its `.sql`
# attribute is used as the row filter when fields are retrieved.
samples = dxdata.load_cohort("/Cohort/all_participants")
- retrieve data for fields
# Retrieve the selected fields, filtered by the cohort's SQL, and convert
# the result to a Koalas DataFrame.
samples_df = participant.retrieve_fields(fields = fields, filter_sql = samples.sql, engine=dxdata.connect()).to_koalas()
- export data
type(samples_df)

# Convert to a pandas DataFrame for inspection.
df_phenotype = samples_df.to_pandas()
df_phenotype.shape
df_phenotype.head()

# Write the table as tab-separated text with missing values written as 'NA'.
# NOTE(review): the export is called on `samples_df` (the Koalas frame), not
# on `df_phenotype`; `index=` and `quoting=` are pandas.DataFrame.to_csv
# parameters (quoting=3 is csv.QUOTE_NONE) -- confirm which frame was
# intended to be written.
samples_df.to_csv('all_samples.txt', sep='\t', na_rep='NA', index=False, quoting=3)
- upload (save) codes and results
%%bash -s "/phenotype_data/"
# Upload the exported table to the project folder passed as $1.
# NOTE(review): the export cell writes 'all_samples.txt', while this command
# uploads 'all_samples_.txt' -- confirm the intended file name.
dx upload all_samples_.txt -p --path $1 --brief
%%bash -s "/code/"
# Upload this notebook to the project folder passed as $1.
dx upload big_allsamples.ipynb -p --path $1 --brief