-
Notifications
You must be signed in to change notification settings - Fork 2
Feat/devsu 2830 load flags #107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
d329813
4859c20
97f4740
29521a6
8e86f75
caa77be
31e6af5
6305d2f
36d6ab0
bbabdd9
90a51bd
dab3e65
7fe28de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -202,6 +202,16 @@ | |
| "number", | ||
| "null" | ||
| ] | ||
| }, | ||
| "flags": { | ||
| "description": "variant flags", | ||
| "items": { | ||
| "type": "string" | ||
| }, | ||
| "type": [ | ||
| "array", | ||
| "null" | ||
| ] | ||
| } | ||
| }, | ||
| "required": [ | ||
|
|
@@ -475,6 +485,16 @@ | |
| "null", | ||
| "string" | ||
| ] | ||
| }, | ||
| "flags": { | ||
| "description": "variant flags", | ||
| "items": { | ||
| "type": "string" | ||
| }, | ||
| "type": [ | ||
| "array", | ||
| "null" | ||
| ] | ||
| } | ||
| }, | ||
| "required": [ | ||
|
|
@@ -1106,6 +1126,16 @@ | |
| "string", | ||
| "null" | ||
| ] | ||
| }, | ||
| "flags": { | ||
| "description": "variant flags", | ||
| "items": { | ||
| "type": "string" | ||
| }, | ||
| "type": [ | ||
| "array", | ||
| "null" | ||
| ] | ||
| } | ||
| }, | ||
| "required": [ | ||
|
|
@@ -1161,8 +1191,7 @@ | |
| "description": "the type of underlying structural variant", | ||
| "example": "deletion", | ||
| "type": "string" | ||
| }, | ||
| "exon1": { | ||
| }, "exon1": { | ||
| "description": "the 5' (n-terminal) exon", | ||
| "example": 1, | ||
| "type": [ | ||
|
|
@@ -1290,6 +1319,16 @@ | |
| "integer", | ||
| "null" | ||
| ] | ||
| }, | ||
| "flags": { | ||
| "description": "variant flags", | ||
| "items": { | ||
| "type": "string" | ||
| }, | ||
| "type": [ | ||
| "array", | ||
| "null" | ||
| ] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I presume all this validation has also been added to the IPR API side? Ok, just saw DEVSU-2828 I wasn't aware of, so I'm guessing it's a yes. |
||
| } | ||
| }, | ||
| "required": [ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -60,6 +60,7 @@ | |
| 'comments', | ||
| 'library', | ||
| 'germline', | ||
| 'flags', | ||
| ] | ||
|
|
||
| SMALL_MUT_REQ = ['gene', 'proteinChange'] | ||
|
|
@@ -98,6 +99,7 @@ | |
| 'tumourRefCount', | ||
| 'tumourRefCopies', | ||
| 'zygosity', | ||
| 'flags', | ||
| ] | ||
|
|
||
| EXP_REQ = ['gene', 'kbCategory'] | ||
|
|
@@ -130,6 +132,7 @@ | |
| 'rnaReads', | ||
| 'rpkm', | ||
| 'tpm', | ||
| 'flags', | ||
| ] | ||
|
|
||
| SV_REQ = [ | ||
|
|
@@ -162,12 +165,13 @@ | |
| 'tumourDepth', | ||
| 'germline', | ||
| 'mavis_product_id', | ||
| 'flags', | ||
| ] | ||
|
|
||
| SIGV_REQ = ['signatureName', 'variantTypeName'] | ||
| SIGV_COSMIC = ['signature'] # 1st element used as signatureName key | ||
| SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2'] | ||
| SIGV_OPTIONAL = ['displayName'] | ||
| SIGV_OPTIONAL = ['displayName', 'flags'] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need that flag too on signature variants? |
||
| SIGV_KEY = SIGV_REQ[:] | ||
|
|
||
|
|
||
|
|
@@ -278,6 +282,7 @@ def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]: | |
| return tuple(['small mutation'] + key_vals) | ||
|
|
||
| result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key) | ||
|
|
||
| if not result: | ||
| return [] | ||
|
|
||
|
|
@@ -336,6 +341,7 @@ def row_key(row: Dict) -> Tuple[str, ...]: | |
| return tuple(['expression'] + [row[key] for key in EXP_KEY]) | ||
|
|
||
| variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key) | ||
|
|
||
| result = [cast(IprExprVariant, var) for var in variants] | ||
| float_columns = [ | ||
| col | ||
|
|
@@ -371,7 +377,6 @@ def row_key(row: Dict) -> Tuple[str, ...]: | |
|
|
||
| if errors: | ||
| raise ValueError(f'{len(errors)} Invalid expression variants in file') | ||
|
|
||
| return result | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -160,7 +160,6 @@ def convert_statements_to_alterations( | |
| ) | ||
| if query_result: | ||
| recruitment_statuses[rid] = query_result[0]['recruitmentStatus'] # type: ignore | ||
|
|
||
| for statement in statements: | ||
| variants = [ | ||
| cast(Variant, c) for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES | ||
|
|
@@ -229,6 +228,7 @@ def convert_statements_to_alterations( | |
| row['kbContextId'], 'not found' | ||
| ) | ||
| rows.append(row) | ||
|
|
||
| return rows | ||
|
|
||
|
|
||
|
|
@@ -727,3 +727,66 @@ def get_kb_disease_matches( | |
| raise ValueError(msg) | ||
|
|
||
| return disease_matches | ||
|
|
||
|
|
||
| def ensure_str_list(val): | ||
| if isinstance(val, str): | ||
| return [f.strip() for f in val.split(',') if f.strip()] | ||
| if isinstance(val, list): | ||
| if not all(isinstance(item, str) for item in val): | ||
| raise TypeError('All items in flags must be strings') | ||
| return val | ||
| raise TypeError(f'Unexpected type in flags field: {type(val).__name__}') | ||
|
|
||
|
|
||
| def add_transcript_flags(variant_sources, transcript_flags_df): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So that function go through all observed variants and add flag(s) to each of them when it applies, based on the tsv? Am I getting it right? |
||
| lookup = dict(zip(transcript_flags_df['transcript'], transcript_flags_df['flags'])) | ||
|
|
||
| for record in variant_sources: | ||
| flags_str = lookup.get(record.get('transcript')) | ||
| if not flags_str: | ||
| continue | ||
| # Split on commas and strip whitespace | ||
| new_flags = ensure_str_list(str(flags_str)) | ||
| flags = ensure_str_list(record.setdefault('flags', [])) | ||
| for new_flag in new_flags: | ||
| if new_flag not in flags: | ||
| flags.append(new_flag) | ||
| record['flags'] = flags | ||
|
|
||
| # fusions: check both transcripts for flags and add to the same record | ||
| label_map = {'ctermTranscript': 'cterm', 'ntermTranscript': 'nterm'} | ||
|
|
||
| for record in variant_sources: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so we're looping again through all variants, but this time for fusions, based on ctermTranscript and ntermTranscript properties? |
||
| flags = ensure_str_list(record.setdefault('flags', [])) | ||
|
|
||
| for key, label in label_map.items(): | ||
| transcript = record.get(key) | ||
| flags_str = lookup.get(transcript) | ||
| if not flags_str: | ||
| continue | ||
|
|
||
| for flag in ensure_str_list(str(flags_str)): | ||
| new_flag = f'{flag} ({label})' | ||
| if new_flag not in flags: | ||
| flags.append(new_flag) | ||
| record['flags'] = flags | ||
| return variant_sources | ||
|
|
||
|
|
||
| def get_variant_flags(variant_sources): | ||
| flags = [] | ||
| for item in variant_sources: | ||
| raw_flags = item.get('flags') | ||
| if not raw_flags: # skips None and '' | ||
| continue | ||
| # create record, removing dupes from flags list | ||
| flags.append( | ||
| { | ||
| 'variant': item['key'], | ||
| 'variantType': item['variantType'], | ||
| 'flags': list(set([f for f in ensure_str_list(raw_flags) if f])), | ||
| } | ||
| ) | ||
| item.pop('flags', None) # remove after extraction | ||
| return flags | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ | |
| import jsonschema.exceptions | ||
| import logging | ||
| import os | ||
| import pandas as pd | ||
| from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser | ||
| from typing import Callable, Dict, List, Optional, Sequence, Set | ||
|
|
||
|
|
@@ -46,6 +47,8 @@ | |
| get_kb_disease_matches, | ||
| get_kb_matches_sections, | ||
| select_expression_plots, | ||
| get_variant_flags, | ||
| add_transcript_flags, | ||
| ) | ||
| from .summary import auto_analyst_comments, get_ipr_analyst_comments | ||
| from .therapeutic_options import create_therapeutic_options | ||
|
|
@@ -157,6 +160,12 @@ def command_interface() -> None: | |
| action='store_true', | ||
| help='True if ignore extra fields in json', | ||
| ) | ||
| parser.add_argument( | ||
| '--transcript_flags', | ||
| required=False, | ||
| type=file_path, | ||
| help='TSV without header, with columns: gene, transcript, comma-separated list of flags', | ||
| ) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a place where we can see how this file would looks like? With the test data maybe? For questions like: Is 'gene' a gene symbol or an ensembl stable id? Are transcripts versioned? etc. |
||
| args = parser.parse_args() | ||
|
|
||
| with open(args.content, 'r') as fh: | ||
|
|
@@ -181,6 +190,7 @@ def command_interface() -> None: | |
| upload_json=args.upload_json, | ||
| validate_json=args.validate_json, | ||
| ignore_extra_fields=args.ignore_extra_fields, | ||
| transcript_flags=args.transcript_flags, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -234,7 +244,7 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict | |
| for key, count in removed_keys.items(): | ||
| logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.") | ||
|
|
||
| drop_columns = ['variant', 'variantType', 'histogramImage'] | ||
| drop_columns = ['variant', 'variantType', 'histogramImage', 'flags'] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it temporary? |
||
| # DEVSU-2034 - use a 'displayName' | ||
| VARIANT_LIST_KEYS = [ | ||
| 'expressionVariants', | ||
|
|
@@ -281,7 +291,6 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict | |
|
|
||
| # Removing cosmicSignatures. Temporary | ||
| upload_content.pop('cosmicSignatures', None) | ||
|
|
||
| return upload_content | ||
|
|
||
|
|
||
|
|
@@ -318,6 +327,7 @@ def ipr_report( | |
| validate_json: bool = False, | ||
| ignore_extra_fields: bool = False, | ||
| tmb_high: float = TMB_SIGNATURE_HIGH_THRESHOLD, | ||
| transcript_flags: str = '', | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Description could be added in the docstring. |
||
| ) -> Dict: | ||
| """Run the matching and create the report JSON for upload to IPR. | ||
|
|
||
|
|
@@ -386,6 +396,12 @@ def ipr_report( | |
| logger.error('Failed schema check - report variants may be corrupted or unmatched.') | ||
| logger.error(f'Failed schema check: {err}') | ||
|
|
||
| transcript_flags_df = None | ||
| if transcript_flags: | ||
| transcript_flags_df = pd.read_csv( | ||
| transcript_flags, sep='\t', names=['gene', 'transcript', 'flags'] | ||
| ) | ||
|
|
||
| # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS) | ||
| signature_variants: List[IprSignatureVariant] = preprocess_signature_variants( | ||
| [ | ||
|
|
@@ -410,6 +426,7 @@ def ipr_report( | |
| expression_variants: List[IprExprVariant] = preprocess_expression_variants( | ||
| content.get('expressionVariants', []) | ||
| ) | ||
|
|
||
| # Additional checks | ||
| if expression_variants: | ||
| check_comparators(content, expression_variants) | ||
|
|
@@ -459,6 +476,10 @@ def ipr_report( | |
| *structural_variants, | ||
| ] # type: ignore | ||
|
|
||
| # ANNOTATING VARIANTS WITH TRANSCRIPT FLAGS | ||
| if transcript_flags_df is not None and not transcript_flags_df.empty: | ||
| all_variants = add_transcript_flags(all_variants, transcript_flags_df) | ||
|
|
||
| # GKB_MATCHES FILTERING | ||
| if match_germline: | ||
| # verify germline kb statements matched germline observed variants, not somatic variants | ||
|
|
@@ -527,10 +548,24 @@ def ipr_report( | |
| gkb_matches, all_variants, kb_matched_sections['kbMatches'] | ||
| ) | ||
|
|
||
| variant_sources = [ | ||
| v | ||
| for source in [ | ||
| [v for v in small_mutations if v['gene'] in genes_with_variants], | ||
| [v for v in copy_variants if v['gene'] in genes_with_variants], | ||
| [v for v in expression_variants if v['gene'] in genes_with_variants], | ||
| signature_variants, | ||
| filter_structural_variants(structural_variants, gkb_matches, gene_information), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We do call that function again with the same params at line 585. |
||
| ] | ||
| for v in source | ||
| ] | ||
| observed_vars_section = get_variant_flags(variant_sources) | ||
|
|
||
| # OUTPUT CONTENT | ||
| # thread safe deep-copy the original content | ||
| output = json.loads(json.dumps(content)) | ||
| output.update(kb_matched_sections) | ||
|
|
||
| output.update( | ||
| { | ||
| 'copyVariants': [ | ||
|
|
@@ -550,15 +585,20 @@ def ipr_report( | |
| for s in filter_structural_variants( | ||
| structural_variants, gkb_matches, gene_information | ||
| ) | ||
| ], | ||
| ], # TODO NB are we omitting non-matched sv's? | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. by sv's you mean structural variants, not signature variants, right? |
||
| 'signatureVariants': [trim_empty_values(s) for s in signature_variants], | ||
| 'genes': gene_information, | ||
| 'genomicAlterationsIdentified': key_alterations, | ||
| 'variantCounts': variant_counts, | ||
| 'analystComments': comments, | ||
| 'therapeuticTarget': targets, | ||
| 'observedVariantAnnotations': observed_vars_section, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is it used for? |
||
| } | ||
| ) | ||
|
|
||
| # TODO there are 13 outliers in the test data; if even only three are matched, why are only those three | ||
| # shown in the expression section? shouldn't we be seeing the non-kbmatched vars there as well? | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In which table exactly? |
||
| output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants)) | ||
|
|
||
| # if input includes hrdScore field, that is ok to pass to db | ||
|
|
@@ -577,6 +617,7 @@ def ipr_report( | |
| if not ipr_conn: | ||
| raise ValueError('ipr_url required to upload report') | ||
| ipr_spec = ipr_conn.get_spec() | ||
|
|
||
| output = clean_unsupported_content(output, ipr_spec) | ||
| try: | ||
| logger.info(f'Uploading to IPR {ipr_conn.url}') | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
formatting issue here, but still works.