Skip to content
Open
43 changes: 41 additions & 2 deletions pori_python/ipr/content.spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@
"number",
"null"
]
},
"flags": {
"description": "variant flags",
"items": {
"type": "string"
},
"type": [
"array",
"null"
]
}
},
"required": [
Expand Down Expand Up @@ -475,6 +485,16 @@
"null",
"string"
]
},
"flags": {
"description": "variant flags",
"items": {
"type": "string"
},
"type": [
"array",
"null"
]
}
},
"required": [
Expand Down Expand Up @@ -1106,6 +1126,16 @@
"string",
"null"
]
},
"flags": {
"description": "variant flags",
"items": {
"type": "string"
},
"type": [
"array",
"null"
]
}
},
"required": [
Expand Down Expand Up @@ -1161,8 +1191,7 @@
"description": "the type of underlying structural variant",
"example": "deletion",
"type": "string"
},
"exon1": {
}, "exon1": {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

formatting issue here, but still works.

"description": "the 5' (n-terminal) exon",
"example": 1,
"type": [
Expand Down Expand Up @@ -1290,6 +1319,16 @@
"integer",
"null"
]
},
"flags": {
"description": "variant flags",
"items": {
"type": "string"
},
"type": [
"array",
"null"
]
Copy link
Copy Markdown
Member

@mathieulemieux mathieulemieux Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I presume all this validation has also been added to the IPR API side? Ok, just saw DEVSU-2828 I wasn't aware of, so I'm guessing it's a yes.

}
},
"required": [
Expand Down
9 changes: 7 additions & 2 deletions pori_python/ipr/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
'comments',
'library',
'germline',
'flags',
]

SMALL_MUT_REQ = ['gene', 'proteinChange']
Expand Down Expand Up @@ -98,6 +99,7 @@
'tumourRefCount',
'tumourRefCopies',
'zygosity',
'flags',
]

EXP_REQ = ['gene', 'kbCategory']
Expand Down Expand Up @@ -130,6 +132,7 @@
'rnaReads',
'rpkm',
'tpm',
'flags',
]

SV_REQ = [
Expand Down Expand Up @@ -162,12 +165,13 @@
'tumourDepth',
'germline',
'mavis_product_id',
'flags',
]

SIGV_REQ = ['signatureName', 'variantTypeName']
SIGV_COSMIC = ['signature'] # 1st element used as signatureName key
SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
SIGV_OPTIONAL = ['displayName']
SIGV_OPTIONAL = ['displayName', 'flags']
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need that flag too on signature variants?

SIGV_KEY = SIGV_REQ[:]


Expand Down Expand Up @@ -278,6 +282,7 @@ def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]:
return tuple(['small mutation'] + key_vals)

result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key)

if not result:
return []

Expand Down Expand Up @@ -336,6 +341,7 @@ def row_key(row: Dict) -> Tuple[str, ...]:
return tuple(['expression'] + [row[key] for key in EXP_KEY])

variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key)

result = [cast(IprExprVariant, var) for var in variants]
float_columns = [
col
Expand Down Expand Up @@ -371,7 +377,6 @@ def row_key(row: Dict) -> Tuple[str, ...]:

if errors:
raise ValueError(f'{len(errors)} Invalid expression variants in file')

return result


Expand Down
65 changes: 64 additions & 1 deletion pori_python/ipr/ipr.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ def convert_statements_to_alterations(
)
if query_result:
recruitment_statuses[rid] = query_result[0]['recruitmentStatus'] # type: ignore

for statement in statements:
variants = [
cast(Variant, c) for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES
Expand Down Expand Up @@ -229,6 +228,7 @@ def convert_statements_to_alterations(
row['kbContextId'], 'not found'
)
rows.append(row)

return rows


Expand Down Expand Up @@ -727,3 +727,66 @@ def get_kb_disease_matches(
raise ValueError(msg)

return disease_matches


def ensure_str_list(val):
if isinstance(val, str):
return [f.strip() for f in val.split(',') if f.strip()]
if isinstance(val, list):
if not all(isinstance(item, str) for item in val):
raise TypeError('All items in flags must be strings')
return val
raise TypeError(f'Unexpected type in flags field: {type(val).__name__}')


def add_transcript_flags(variant_sources, transcript_flags_df):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So that function go through all observed variants and add flag(s) to each of them when it applies, based on the tsv? Am I getting it right?

lookup = dict(zip(transcript_flags_df['transcript'], transcript_flags_df['flags']))

for record in variant_sources:
flags_str = lookup.get(record.get('transcript'))
if not flags_str:
continue
# Split on commas and strip whitespace
new_flags = ensure_str_list(str(flags_str))
flags = ensure_str_list(record.setdefault('flags', []))
for new_flag in new_flags:
if new_flag not in flags:
flags.append(new_flag)
record['flags'] = flags

# fusions: check both transcripts for flags and add to the same record
label_map = {'ctermTranscript': 'cterm', 'ntermTranscript': 'nterm'}

for record in variant_sources:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so we're looping again through all variants, but this time for fusions, based on ctermTranscript and ntermTranscript properties?

flags = ensure_str_list(record.setdefault('flags', []))

for key, label in label_map.items():
transcript = record.get(key)
flags_str = lookup.get(transcript)
if not flags_str:
continue

for flag in ensure_str_list(str(flags_str)):
new_flag = f'{flag} ({label})'
if new_flag not in flags:
flags.append(new_flag)
record['flags'] = flags
return variant_sources


def get_variant_flags(variant_sources):
flags = []
for item in variant_sources:
raw_flags = item.get('flags')
if not raw_flags: # skips None and ''
continue
# create record, removing dupes from flags list
flags.append(
{
'variant': item['key'],
'variantType': item['variantType'],
'flags': list(set([f for f in ensure_str_list(raw_flags) if f])),
}
)
item.pop('flags', None) # remove after extraction
return flags
47 changes: 44 additions & 3 deletions pori_python/ipr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import jsonschema.exceptions
import logging
import os
import pandas as pd
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
from typing import Callable, Dict, List, Optional, Sequence, Set

Expand Down Expand Up @@ -46,6 +47,8 @@
get_kb_disease_matches,
get_kb_matches_sections,
select_expression_plots,
get_variant_flags,
add_transcript_flags,
)
from .summary import auto_analyst_comments, get_ipr_analyst_comments
from .therapeutic_options import create_therapeutic_options
Expand Down Expand Up @@ -157,6 +160,12 @@ def command_interface() -> None:
action='store_true',
help='True if ignore extra fields in json',
)
parser.add_argument(
'--transcript_flags',
required=False,
type=file_path,
help='TSV without header, with columns: gene, transcript, comma-separated list of flags',
)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a place where we can see how this file would looks like? With the test data maybe? For questions like: Is 'gene' a gene symbol or an ensembl stable id? Are transcripts versioned? etc.

args = parser.parse_args()

with open(args.content, 'r') as fh:
Expand All @@ -181,6 +190,7 @@ def command_interface() -> None:
upload_json=args.upload_json,
validate_json=args.validate_json,
ignore_extra_fields=args.ignore_extra_fields,
transcript_flags=args.transcript_flags,
)


Expand Down Expand Up @@ -234,7 +244,7 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict
for key, count in removed_keys.items():
logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.")

drop_columns = ['variant', 'variantType', 'histogramImage']
drop_columns = ['variant', 'variantType', 'histogramImage', 'flags']
Copy link
Copy Markdown
Member

@mathieulemieux mathieulemieux Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it temporary?

# DEVSU-2034 - use a 'displayName'
VARIANT_LIST_KEYS = [
'expressionVariants',
Expand Down Expand Up @@ -281,7 +291,6 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict

# Removing cosmicSignatures. Temporary
upload_content.pop('cosmicSignatures', None)

return upload_content


Expand Down Expand Up @@ -318,6 +327,7 @@ def ipr_report(
validate_json: bool = False,
ignore_extra_fields: bool = False,
tmb_high: float = TMB_SIGNATURE_HIGH_THRESHOLD,
transcript_flags: str = '',
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Description could be added in the docstring.

) -> Dict:
"""Run the matching and create the report JSON for upload to IPR.

Expand Down Expand Up @@ -386,6 +396,12 @@ def ipr_report(
logger.error('Failed schema check - report variants may be corrupted or unmatched.')
logger.error(f'Failed schema check: {err}')

transcript_flags_df = None
if transcript_flags:
transcript_flags_df = pd.read_csv(
transcript_flags, sep='\t', names=['gene', 'transcript', 'flags']
)

# INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS)
signature_variants: List[IprSignatureVariant] = preprocess_signature_variants(
[
Expand All @@ -410,6 +426,7 @@ def ipr_report(
expression_variants: List[IprExprVariant] = preprocess_expression_variants(
content.get('expressionVariants', [])
)

# Additional checks
if expression_variants:
check_comparators(content, expression_variants)
Expand Down Expand Up @@ -459,6 +476,10 @@ def ipr_report(
*structural_variants,
] # type: ignore

# ANNOTATING VARIANTS WITH TRANSCRIPT FLAGS
if transcript_flags_df is not None and not transcript_flags_df.empty:
all_variants = add_transcript_flags(all_variants, transcript_flags_df)

# GKB_MATCHES FILTERING
if match_germline:
# verify germline kb statements matched germline observed variants, not somatic variants
Expand Down Expand Up @@ -527,10 +548,24 @@ def ipr_report(
gkb_matches, all_variants, kb_matched_sections['kbMatches']
)

variant_sources = [
v
for source in [
[v for v in small_mutations if v['gene'] in genes_with_variants],
[v for v in copy_variants if v['gene'] in genes_with_variants],
[v for v in expression_variants if v['gene'] in genes_with_variants],
signature_variants,
filter_structural_variants(structural_variants, gkb_matches, gene_information),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do call that function again with the same params at line 585.

]
for v in source
]
observed_vars_section = get_variant_flags(variant_sources)

# OUTPUT CONTENT
# thread safe deep-copy the original content
output = json.loads(json.dumps(content))
output.update(kb_matched_sections)

output.update(
{
'copyVariants': [
Expand All @@ -550,15 +585,20 @@ def ipr_report(
for s in filter_structural_variants(
structural_variants, gkb_matches, gene_information
)
],
], # TODO NB are we omitting non-matched sv's?
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

by sv's you mean structural variants, not signature variants, right?

'signatureVariants': [trim_empty_values(s) for s in signature_variants],
'genes': gene_information,
'genomicAlterationsIdentified': key_alterations,
'variantCounts': variant_counts,
'analystComments': comments,
'therapeuticTarget': targets,
'observedVariantAnnotations': observed_vars_section,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is it used for?

}
)

# TODO there are 13 outliers in the test data; if even only three are matched, why are only those three
# shown in the expression section? shouldn't we be seeing the non-kbmatched vars there as well?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In which table exactly?

output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants))

# if input includes hrdScore field, that is ok to pass to db
Expand All @@ -577,6 +617,7 @@ def ipr_report(
if not ipr_conn:
raise ValueError('ipr_url required to upload report')
ipr_spec = ipr_conn.get_spec()

output = clean_unsupported_content(output, ipr_spec)
try:
logger.info(f'Uploading to IPR {ipr_conn.url}')
Expand Down
3 changes: 2 additions & 1 deletion pori_python/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,12 @@ def __hash__(self):


class IprVariantBase(TypedDict):
"""Required properties of all variants for IPR."""
"""Required or possible properties of all variants for IPR."""

key: str
variantType: str
variant: str
flags: Optional[List[str]]


class IprGeneVariant(IprVariantBase):
Expand Down
Loading
Loading