Skip to content

Data discovery JSON structure

Elasticsearch data is stored as JSON (JavaScript Object Notation) documents, one for each participant.

Below is a much-simplified example of the JSON document that forms the record Elasticsearch holds for each participant. Note that the sample record shown here is an amalgamation of synthetic cancer and rare disease values. The structure consists of top-level attributes, with additional attributes contained in sub-documents. The structure also shows how the names of nested elements that you may wish to search for are formed, for example:

current_age: a top-level element that can be addressed directly.
variants.gene_symbol: variants is an array of elements nested within the top level, so to address ‘gene_symbol’ you must prefix it with the name of the containing nested element.
hes_diagnoses.diagnosis: there may be many Hospital Episode Statistics diagnoses over time, which are represented as the hes_diagnoses array.

Note the following querying limitations.

You can query for matches between attributes at the document top-level e.g. gender and current_age.

You can query for matches between attributes at the document top-level and attributes in sub-documents e.g. current_age and diagnosis code.

BUT you cannot query for matches between two sub-document attributes appearing in the same sub-document instance, e.g. hes_diagnoses.diagnosis and hes_diagnoses.diagdate. Another unsupported example is variants.gene_symbol and variants.tier.

The synthetic, simplified example below is constructed to show several JSON sub-documents and data combinations for many of the document fields. Note that some fields in the JSON document are not suitable for searching or visualisation in Data Discovery.

Synthetic Example
  "administrative_gender": "Male",
  "affected_status": "Affected",  
  "biological_relationship_to_proband": null,  
  "cancer_analyses": [  
      "disease_type": "Ovarian",  
  "disease_sub_type": "Endometrioid Adenocarcinoma",  
  "preparation_method": "FF",  
  "tumour_type": "PRIMARY",  
  "tissue_source": "SURGICAL_RESECTION",  
  "morphology_snomed_rt": "M83103",  
  "topography_snomed_rt": "T87000",  
  "morphology_snomed_ct": null,  
  "topography_snomed_ct": null,  
  "morphology_icd": null,  
  "topography_icd": null,  
  "library_type": "PCR-Free",  
  "somatic_coding_variants_per_mb": 2.09,  
  "signature1": 35,  
  "signature2": 6,  
  "signature3": null,  
  "signature4": null,  
  "signature5": null,  
  "signature6": null,  
  "signature7": null,  
  "signature8": 18,  
  "signature9": 14,  
  "signature10": null,  
  "signature11": null,  
  "signature12": null,  
  "signature13": 6,  
  "signature14": null,  
  "signature15": null,  
  "signature16": null,  
  "signature17": null,  
  "signature18": 6,  
  "signature19": null,  
  "signature20": null,  
  "signature21": null,  
  "signature22": null,  
  "signature23": null,  
  "signature24": null,  
  "signature25": null,  
  "signature26": null,  
  "signature27": null,  
  "signature28": null,  
  "signature29": null,  
  "signature30": null,  
  "age_of_onset": 52,  
  "age_of_onset_range": "051 to 60"  
  "cancer_participant_diseases": [  
      "cancer_disease_sub_type": "Endometrioid Carcinoma",  
  "cancer_disease_type": "Ovarian"  
  "cancer_participant_quality": "Individuals with Quality Passed Interpreted Genomes",
  "cohort": "100,000 Genomes Project",
  "cross_cohort_affected_status": [
  "cross_cohort_ethnicity": "Asian or Asian British: Pakistani",
  "cross_cohort_family_group_size": [
  "cross_cohort_gender": "Male",
  "cross_cohort_participant_type": "Non Proband",
  "cross_cohort_programme": [
    "Rare Diseases"
  "current_age": 55,  
  "current_age_range": "051 to 60",  
  "family_status": null,  
  "father_affected": null,  
  "fathers_ethnic_category": null,  
  "full_brothers_affected": null,  
  "full_sisters_affected": null,  
  "gms_conditions": [
    "ORPHA:XXX abcdefghijklmnopqrtstuvwxyz"
  "gms_observations": [
      "observation": "HP:0001249 Intellectual disability"
  "gms_programme": [
    "Rare Diseases"
  "gms_referrals": [
      "affected_status": "unaffected",
      "clinical_indication": "Intellectual disability",
      "clinical_indication_code": "R29",
      "is_proband": false,
      "ordering_entity": "XXX Hospital",
      "participant_id": "ppNNNNNNNN",
      "referral_actual_size": 3,
      "referral_id": "rrNNNNNNN",
      "referral_test_expected_number_of_participants": 3
  "handling_gmc_trust": "XXX Hospital",  
  "hes_diagnoses": [  
      "diagdate": "2011-04-15T00:00:00+0000",  
      "diagnosis": "R69 Unknown and unspecified causes of morbidity",  
      "source": "op"  
      "diagdate": "2017-08-08T00:00:00+0000",
      "diagnosis": "C50.9 Breast - unspecified",
      "source": "apc"
      "hes_procedures": [  
          "procedure_date": "2001-01-01T00:00:00+0000",  
      "procedure": "F10.4 Extraction of multiple teeth NEC",  
      "source": "apc"  
      "procedure_date": "2016-04-15T00:00:00+0000",
      "procedure": "Y94.9 Unspecified radiopharmaceutical imaging",
      "source": "op"
  "laboratory_samples": [  
      "sample_source": "BLOOD",  
  "sample_type": "DNA Blood Germline",  
  "dna_sample_type": true  
  "life_status": "Alive",
  "mother_affected": "No",  
  "mothers_ethnic_category": "White: British",  
  "normalised_death_date": null,  
  "participant_and_sample_table": [  
  "participant_ethnic_category": "Not Stated",  
  "participant_families": [  
      "family_group_type": "Trio with Mother and Father",  
  "proband_id": 999000999  
  "participant_id": 999000999,  
  "participant_karyotypic_sex": "Not Supplied",  
  "participant_phenotypic_sex": "Male",  
  "participant_stated_gender": "Male",  
  "participant_type": "Proband",  
  "programme": "Rare Diseases",  
  "rare_diseases_family_id": "999000999",  
  "rare_diseases_participant_diseases": [  
      "age_of_onset": -1,  
  "diagnosis_date": "2010-01-01T00:00:00+0000",  
  "normalised_specific_disease": "Intellectual disability",  
  "normalised_disease_sub_group": "Neurodevelopmental disorders",  
  "normalised_disease_group": "Neurology and neurodevelopmental disorders"  
  "age_of_onset_range": "Pre-natal"  
  "rare_diseases_participant_phenotypes": [  
      "hpo_term": "HP:0012781 Mid-frequency hearing loss"  
  "sact": [  
      "actual_dose_per_administration": 500,  
      "administration_date": "2013-11-15T00:00:00+0000",  
      "administration_route": 1,  
      "analysis_group": "CARBOPLATIN + LIPOSOMAL DOX",  
      "benchmark_group": "CARBOPLATIN + LIPOSOMAL DOX",  
      "chemo_radiation": false,  
      "clinical_trial": 2,  
      "comorbidity_adjustment": "N",  
      "cycle_number": 2,  
      "date_decision_to_treat": "2015-09-03T00:00:00+0000",  
      "date_of_final_treatment": null,  
      "drug_group": "CARBOPLATIN",  
      "height_at_start_of_regimen": 1.39,  
      "intent_of_treatment": "D",  
      "morphology_clean": null,  
      "number_of_cycles_planned": 6,  
      "opcs_delivery_code": "X722",  
      "opcs_procurement_code": "X713",  
      "perf_stat_start_of_cycle_clean": 1,  
      "perf_stat_start_of_reg_clean": 1,  
      "primary_diagnosis": "C56",  
      "programme_number": 1,  
      "regimen_mod_dose_reduction": false,  
      "regimen_mod_stopped_early": false,  
      "regimen_mod_time_delay": false,  
      "regimen_number": 1,  
      "regimen_outcome_summary": null,  
      "stage_at_start": "300",  
      "start_date_of_cycle": "2015-10-10T00:00:00+0000",  
      "start_date_of_regimen": "2016-09-12T00:00:00+0000",  
      "weight_at_start_of_cycle": 69.0,  
      "weight_at_start_of_regimen": 69.3  
  "sarscov2Positive": "No test",
  "sequencing_reports": [  
      "type": "rare disease germline",  
      "genome_build": "GRCh37",  
      "type_and_genome_build": "rare disease germline GRCh37",
      "type_genome_build_version": "rare disease germline GRCh38 V2"
  "survival_days": null,  
  "variants": [  
      "allele_origins": [  
  "assembly": "GRCh37",  
  "consequence_type": [  
  "db_snp_id": [  
  "domain": [  
  "gene_symbol": "ZNF717",  
  "genotype": "alternate_homozygous",  
  "id": 40103686317,  
  "mode_of_inheritance": [  
  "mutation_type": "SNP",  
  "participant_id": 999000999,  
  "role_in_cancer": [  
  "sample_id": [  
  "tier": [  
  "variant_key": "chr3:75729769(CC/G)"  
  "x_hpo_observations": [
        "hpo_term": "HP:0001513 obesity"
  "year_of_birth": 2000