diff --git a/runner/index.ts b/runner/index.ts
index 3c0e1f48..3abbf8d0 100644
--- a/runner/index.ts
+++ b/runner/index.ts
@@ -49,3 +49,5 @@ export {NoopProgressLogger} from './progress/noop-progress-logger.js';
 export {TextProgressLogger} from './progress/text-progress-logger.js';
 export {type ServeTestingResult} from './workers/serve-testing/worker-types.js';
 export {replaceAtReferencesInPrompt} from './utils/prompt-at-references.js';
+export {extractRubrics, type RubricInfo} from './utils/extract-rubrics.js';
+export {combineReports} from './utils/combine-reports.mjs';
diff --git a/runner/utils/combine-reports.mts b/runner/utils/combine-reports.mts
new file mode 100644
index 00000000..e39ba062
--- /dev/null
+++ b/runner/utils/combine-reports.mts
@@ -0,0 +1,66 @@
+import assert from 'assert';
+import {RunGroup, RunInfo} from '../shared-interfaces.js';
+import {groupSimilarReports} from '../orchestration/grouping.js';
+
+/**
+ * Combines a list of individual WCS reports into a single WCS group
+ * that is accompanied by one combined run.
+ *
+ * Every report is re-tagged with `groupId` before being grouped, and the
+ * grouping is asserted to produce exactly one group. The combined run
+ * holds the results of all reports; its version, report name, summary
+ * fields and timestamp are copied from the first report.
+ *
+ * @param runs Reports to combine. Must contain at least one entry.
+ * @param groupId Group ID assigned to every report before grouping.
+ * @param runId ID given to the combined run.
+ * @returns The combined group together with the combined run.
+ * @throws If `runs` is empty or the reports do not form exactly one group.
+ */
+export function combineReports(
+  runs: RunInfo[],
+  groupId: string,
+  runId: string,
+): {
+  group: RunGroup;
+  runInfo: RunInfo;
+} {
+  assert.notStrictEqual(runs.length, 0, 'Expected more than zero reports.');
+
+  const combinedRuns = groupSimilarReports(
+    runs.map(r => {
+      return {...r, group: groupId} satisfies RunInfo;
+    }),
+  );
+  assert.strictEqual(combinedRuns.length, 1);
+
+  const combinedRun = combinedRuns[0];
+  const singleSampleRun = runs[0];
+  const runInfo: RunInfo = {
+    id: runId,
+    group: combinedRun.id,
+    results: runs.flatMap(r => r.results),
+    version: singleSampleRun.version,
+    details: {
+      reportName: singleSampleRun.details.reportName,
+      summary: {
+        displayName: singleSampleRun.details.summary.displayName,
+        environmentId: singleSampleRun.details.summary.environmentId,
+        framework: singleSampleRun.details.summary.framework,
+        model: singleSampleRun.details.summary.model,
+        // NOTE(review): only the first report's usage is carried over;
+        // confirm whether it should be aggregated across all reports.
+        usage: singleSampleRun.details.summary.usage,
+      },
+      // The combined run carries no system prompts.
+      systemPromptGeneration: '',
+      systemPromptRepair: '',
+      timestamp: singleSampleRun.details.timestamp,
+    },
+  };
+
+  return {
+    group: combinedRun,
+    runInfo,
+  };
+}
diff --git a/runner/utils/extract-rubrics.ts b/runner/utils/extract-rubrics.ts
new file mode 100644
index 00000000..c87cfe76
--- /dev/null
+++ b/runner/utils/extract-rubrics.ts
@@ -0,0 +1,74 @@
+import {IndividualAssessmentState, RunInfo} from '../shared-interfaces.js';
+
+/** Aggregated score of a single rubric, i.e. one grouping label. */
+export interface RubricInfo {
+  /**
+   * Weighted average of the success percentages of all non-skipped checks
+   * carrying the label, or `0` if no finite average can be computed.
+   */
+  score: number;
+}
+
+/** Contribution of a single check to the weighted average of a rubric. */
+interface WeightedScore {
+  /** Success percentage of the check multiplied by its weight. */
+  value: number;
+  /** Weight of the check. */
+  weight: number;
+}
+
+/**
+ * Computes a score for every grouping label used by the checks of a run.
+ *
+ * Each non-skipped check contributes its success percentage to all of its
+ * labels, weighted by the maximum points of its category multiplied by the
+ * check's `scoreReduction` (parsed as a percentage). The score of a label
+ * is the weighted average of those contributions.
+ *
+ * @param run Run whose assessment results should be analyzed.
+ * @returns Scores keyed by grouping label.
+ */
+export function extractRubrics(run: RunInfo): Record<string, RubricInfo> {
+  // A `Map` keeps labels such as `constructor` from colliding with
+  // properties inherited from `Object.prototype`.
+  const scoresByLabel = new Map<string, WeightedScore[]>();
+
+  for (const app of run.results) {
+    for (const category of app.score.categories) {
+      for (const check of category.assessments) {
+        if (check.state === IndividualAssessmentState.SKIPPED) {
+          continue;
+        }
+
+        // The weight is the same for every label of the check.
+        const checkWeightWithPillar =
+          category.maxPoints * (parseFloat(check.scoreReduction) / 100);
+
+        for (const label of check.groupingLabels ?? []) {
+          let scores = scoresByLabel.get(label);
+          if (scores === undefined) {
+            scores = [];
+            scoresByLabel.set(label, scores);
+          }
+
+          scores.push({
+            value: checkWeightWithPillar * check.successPercentage,
+            weight: checkWeightWithPillar,
+          });
+        }
+      }
+    }
+  }
+
+  return Object.fromEntries(
+    Array.from(scoresByLabel, ([label, scores]): [string, RubricInfo] => {
+      const numerator = scores.reduce((sum, entry) => sum + entry.value, 0);
+      const denominator = scores.reduce((sum, entry) => sum + entry.weight, 0);
+      const score = numerator / denominator;
+
+      // A total weight of zero, or an unparsable `scoreReduction`, would
+      // otherwise surface as `NaN`.
+      return [label, {score: Number.isFinite(score) ? score : 0}];
+    }),
+  );
+}