bcgsc · elewis2 · Feb 23, 2026 · Feb 24, 2026 · Feb 24, 2026 · Mar 4, 2026
diff --git a/pori_python/ipr/content.spec.json b/pori_python/ipr/content.spec.json
@@ -202,6 +202,16 @@
                             "number",
                             "null"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
@@ -475,6 +485,16 @@
                             "null",
                             "string"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
@@ -1106,6 +1126,16 @@
                             "string",
                             "null"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [
@@ -1161,8 +1191,7 @@
                         "description": "the type of underlying structural variant",
                         "example": "deletion",
                         "type": "string"
-                    },
-                    "exon1": {
+                    },                    "exon1": {
                         "description": "the 5' (n-terminal) exon",
                         "example": 1,
                         "type": [
@@ -1290,6 +1319,16 @@
                             "integer",
                             "null"
                         ]
+                    },
+                    "flags": {
+                        "description": "variant flags",
+                        "items": {
+                            "type": "string"
+                        },
+                        "type": [
+                            "array",
+                            "null"
+                        ]
                     }
                 },
                 "required": [

diff --git a/pori_python/ipr/inputs.py b/pori_python/ipr/inputs.py
@@ -60,6 +60,7 @@
     'comments',
     'library',
     'germline',
+    'flags',
 ]
 
 SMALL_MUT_REQ = ['gene', 'proteinChange']
@@ -98,6 +99,7 @@
     'tumourRefCount',
     'tumourRefCopies',
     'zygosity',
+    'flags',
 ]
 
 EXP_REQ = ['gene', 'kbCategory']
@@ -130,6 +132,7 @@
     'rnaReads',
     'rpkm',
     'tpm',
+    'flags',
 ]
 
 SV_REQ = [
@@ -162,12 +165,13 @@
     'tumourDepth',
     'germline',
     'mavis_product_id',
+    'flags',
 ]
 
 SIGV_REQ = ['signatureName', 'variantTypeName']
 SIGV_COSMIC = ['signature']  # 1st element used as signatureName key
 SIGV_HLA = ['a1', 'a2', 'b1', 'b2', 'c1', 'c2']
-SIGV_OPTIONAL = ['displayName']
+SIGV_OPTIONAL = ['displayName', 'flags']
 SIGV_KEY = SIGV_REQ[:]
 
 
@@ -278,6 +282,7 @@ def row_key(row: IprSmallMutationVariant) -> Tuple[str, ...]:
         return tuple(['small mutation'] + key_vals)
 
     result = validate_variant_rows(rows, SMALL_MUT_REQ, SMALL_MUT_OPTIONAL, row_key)
+
     if not result:
         return []
 
@@ -336,6 +341,7 @@ def row_key(row: Dict) -> Tuple[str, ...]:
         return tuple(['expression'] + [row[key] for key in EXP_KEY])
 
     variants = validate_variant_rows(rows, EXP_REQ, EXP_OPTIONAL, row_key)
+
     result = [cast(IprExprVariant, var) for var in variants]
     float_columns = [
         col
@@ -371,7 +377,6 @@ def row_key(row: Dict) -> Tuple[str, ...]:
 
     if errors:
         raise ValueError(f'{len(errors)} Invalid expression variants in file')
-
     return result
 
 

diff --git a/pori_python/ipr/ipr.py b/pori_python/ipr/ipr.py
@@ -160,7 +160,6 @@ def convert_statements_to_alterations(
             )
             if query_result:
                 recruitment_statuses[rid] = query_result[0]['recruitmentStatus']  # type: ignore
-
     for statement in statements:
         variants = [
             cast(Variant, c) for c in statement['conditions'] if c['@class'] in VARIANT_CLASSES
@@ -229,6 +228,7 @@ def convert_statements_to_alterations(
                     row['kbContextId'], 'not found'
                 )
             rows.append(row)
+
     return rows
 
 
@@ -727,3 +727,66 @@ def get_kb_disease_matches(
         raise ValueError(msg)
 
     return disease_matches
+
+
+def ensure_str_list(val):
+    if isinstance(val, str):
+        return [f.strip() for f in val.split(',') if f.strip()]
+    if isinstance(val, list):
+        if not all(isinstance(item, str) for item in val):
+            raise TypeError('All items in flags must be strings')
+        return val
+    raise TypeError(f'Unexpected type in flags field: {type(val).__name__}')
+
+
+def add_transcript_flags(variant_sources, transcript_flags_df):
+    lookup = dict(zip(transcript_flags_df['transcript'], transcript_flags_df['flags']))
+
+    for record in variant_sources:
+        flags_str = lookup.get(record.get('transcript'))
+        if not flags_str:
+            continue
+        # Split on commas and strip whitespace
+        new_flags = ensure_str_list(str(flags_str))
+        flags = ensure_str_list(record.setdefault('flags', []))
+        for new_flag in new_flags:
+            if new_flag not in flags:
+                flags.append(new_flag)
+        record['flags'] = flags
+
+    # fusions: check both transcripts for flags and add to the same record
+    label_map = {'ctermTranscript': 'cterm', 'ntermTranscript': 'nterm'}
+
+    for record in variant_sources:
+        flags = ensure_str_list(record.setdefault('flags', []))
+
+        for key, label in label_map.items():
+            transcript = record.get(key)
+            flags_str = lookup.get(transcript)
+            if not flags_str:
+                continue
+
+            for flag in ensure_str_list(str(flags_str)):
+                new_flag = f'{flag} ({label})'
+                if new_flag not in flags:
+                    flags.append(new_flag)
+                record['flags'] = flags
+    return variant_sources
+
+
+def get_variant_flags(variant_sources):
+    flags = []
+    for item in variant_sources:
+        raw_flags = item.get('flags')
+        if not raw_flags:  # skips None and ''
+            continue
+        # create record, removing dupes from flags list
+        flags.append(
+            {
+                'variant': item['key'],
+                'variantType': item['variantType'],
+                'flags': list(set([f for f in ensure_str_list(raw_flags) if f])),
+            }
+        )
+        item.pop('flags', None)  # remove after extraction
+    return flags
diff --git a/pori_python/ipr/main.py b/pori_python/ipr/main.py
@@ -6,6 +6,7 @@
 import jsonschema.exceptions
 import logging
 import os
+import pandas as pd
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
 from typing import Callable, Dict, List, Optional, Sequence, Set
 
@@ -46,6 +47,8 @@
     get_kb_disease_matches,
     get_kb_matches_sections,
     select_expression_plots,
+    get_variant_flags,
+    add_transcript_flags,
 )
 from .summary import auto_analyst_comments, get_ipr_analyst_comments
 from .therapeutic_options import create_therapeutic_options
@@ -157,6 +160,12 @@ def command_interface() -> None:
         action='store_true',
         help='True if ignore extra fields in json',
     )
+    parser.add_argument(
+        '--transcript_flags',
+        required=False,
+        type=file_path,
+        help='TSV without header, with columns: gene, transcript, comma-separated list of flags',
+    )
     args = parser.parse_args()
 
     with open(args.content, 'r') as fh:
@@ -181,6 +190,7 @@ def command_interface() -> None:
         upload_json=args.upload_json,
         validate_json=args.validate_json,
         ignore_extra_fields=args.ignore_extra_fields,
+        transcript_flags=args.transcript_flags,
     )
 
 
@@ -234,7 +244,7 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict
         for key, count in removed_keys.items():
             logger.warning(f"IPR unsupported property '{key}' removed from {count} genes.")
 
-    drop_columns = ['variant', 'variantType', 'histogramImage']
+    drop_columns = ['variant', 'variantType', 'histogramImage', 'flags']
     # DEVSU-2034 - use a 'displayName'
     VARIANT_LIST_KEYS = [
         'expressionVariants',
@@ -281,7 +291,6 @@ def clean_unsupported_content(upload_content: Dict, ipr_spec: Dict = {}) -> Dict
 
     # Removing cosmicSignatures. Temporary
     upload_content.pop('cosmicSignatures', None)
-
     return upload_content
 
 
@@ -318,6 +327,7 @@ def ipr_report(
     validate_json: bool = False,
     ignore_extra_fields: bool = False,
     tmb_high: float = TMB_SIGNATURE_HIGH_THRESHOLD,
+    transcript_flags: str = '',
 ) -> Dict:
     """Run the matching and create the report JSON for upload to IPR.
 
@@ -386,6 +396,12 @@ def ipr_report(
         logger.error('Failed schema check - report variants may be corrupted or unmatched.')
         logger.error(f'Failed schema check: {err}')
 
+    transcript_flags_df = None
+    if transcript_flags:
+        transcript_flags_df = pd.read_csv(
+            transcript_flags, sep='\t', names=['gene', 'transcript', 'flags']
+        )
+
     # INPUT VARIANTS VALIDATION & PREPROCESSING (OBSERVED BIOMARKERS)
     signature_variants: List[IprSignatureVariant] = preprocess_signature_variants(
         [
@@ -410,6 +426,7 @@ def ipr_report(
     expression_variants: List[IprExprVariant] = preprocess_expression_variants(
         content.get('expressionVariants', [])
     )
+
     # Additional checks
     if expression_variants:
         check_comparators(content, expression_variants)
@@ -459,6 +476,10 @@ def ipr_report(
         *structural_variants,
     ]  # type: ignore
 
+    # ANNOTATING VARIANTS WITH TRANSCRIPT FLAGS
+    if transcript_flags_df is not None and not transcript_flags_df.empty:
+        all_variants = add_transcript_flags(all_variants, transcript_flags_df)
+
     # GKB_MATCHES FILTERING
     if match_germline:
         # verify germline kb statements matched germline observed variants, not somatic variants
@@ -527,10 +548,24 @@ def ipr_report(
         gkb_matches, all_variants, kb_matched_sections['kbMatches']
     )
 
+    variant_sources = [
+        v
+        for source in [
+            [v for v in small_mutations if v['gene'] in genes_with_variants],
+            [v for v in copy_variants if v['gene'] in genes_with_variants],
+            [v for v in expression_variants if v['gene'] in genes_with_variants],
+            signature_variants,
+            filter_structural_variants(structural_variants, gkb_matches, gene_information),
+        ]
+        for v in source
+    ]
+    observed_vars_section = get_variant_flags(variant_sources)
+
     # OUTPUT CONTENT
     # thread safe deep-copy the original content
     output = json.loads(json.dumps(content))
     output.update(kb_matched_sections)
+
     output.update(
         {
             'copyVariants': [
@@ -550,15 +585,20 @@ def ipr_report(
                 for s in filter_structural_variants(
                     structural_variants, gkb_matches, gene_information
                 )
-            ],
+            ],  # TODO NB are we omitting non-matched sv's?
             'signatureVariants': [trim_empty_values(s) for s in signature_variants],
             'genes': gene_information,
             'genomicAlterationsIdentified': key_alterations,
             'variantCounts': variant_counts,
             'analystComments': comments,
             'therapeuticTarget': targets,
+            'observedVariantAnnotations': observed_vars_section,
         }
     )
+
+    # TODO there are 13 outliers in the test data; if even only three are matched, why are only those three
+    # shown in the expression section? shouldn't we be seeing the non-kbmatched vars there as well?
+
     output.setdefault('images', []).extend(select_expression_plots(gkb_matches, all_variants))
 
     # if input includes hrdScore field, that is ok to pass to db
@@ -577,6 +617,7 @@ def ipr_report(
         if not ipr_conn:
             raise ValueError('ipr_url required to upload report')
         ipr_spec = ipr_conn.get_spec()
+
         output = clean_unsupported_content(output, ipr_spec)
         try:
             logger.info(f'Uploading to IPR {ipr_conn.url}')

diff --git a/pori_python/types.py b/pori_python/types.py
@@ -134,11 +134,12 @@ def __hash__(self):
 
 
 class IprVariantBase(TypedDict):
-    """Required properties of all variants for IPR."""
+    """Required or possible properties of all variants for IPR."""
 
     key: str
     variantType: str
     variant: str
+    flags: Optional[List[str]]
 
 
 class IprGeneVariant(IprVariantBase):