Source code
Revision control
Copy as Markdown
Other Tools
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import json
import os
from abc import ABC, abstractmethod
from typing import Any, Callable
import requests
class _Evaluation(ABC):
    """
    Abstract parent of every evaluation that mozperftest runs.

    Concrete evaluations subclass this and implement ``run``.
    See python/mozperftest/mozperftest/metrics/eval.py
    """

    # Requirements listed here will be pip installed via the test harness.
    # They should not require a build, and should have the appropriate .whl
    # files for reproducibility.
    requirements: list[str] = []

    def __init__(self, log: Callable[[str], None], config: dict[str, Any]) -> None:
        # Keep the logging callback and the evaluation settings on the instance
        # so that subclasses can reach them.
        self.log, self.config = log, config

    @abstractmethod
    def run(self, payloads: list[dict[str, Any]]) -> dict:
        """Run the evaluation and return a perftest metric result."""
class _LlmJudge(_Evaluation):
    """
    Use the Mozilla LLM Proxy Auth (MLPA) endpoint to run an LLM as a judge.

    Values read from ``config``:
      - "endpoint": URL that the chat request is POSTed to. There is no default.
      - "model": model identifier, defaulting to "vertex_ai/mistral-small-2503".

    The bearer token is read from the MOZ_FXA_BEARER_TOKEN environment variable.
    This class does not implement ``run``; subclasses provide it.
    """

    def __init__(self, log: Callable[[str], None], config: dict[str, Any]) -> None:
        super().__init__(log, config)
        # Nothing is validated here, so building a judge never fails; a missing
        # endpoint or token is reported by query_llm when a request is made.
        self.endpoint = config.get("endpoint")
        self.model = config.get("model", "vertex_ai/mistral-small-2503")
        self.token = os.environ.get("MOZ_FXA_BEARER_TOKEN")

    def query_llm(self, messages: list[Any]) -> Any:
        """
        POST ``messages`` to the configured endpoint and return the decoded JSON body.

        Raises:
            RuntimeError: if the bearer token or the endpoint is not set.
            requests.HTTPError: if the endpoint replies with an error status.
        """
        if not self.token:
            raise RuntimeError("Missing MOZ_FXA_BEARER_TOKEN for LLM evaluation.")
        if not self.endpoint:
            # Fail with an actionable message instead of letting requests reject
            # a None URL.
            raise RuntimeError('Missing "endpoint" in the LLM evaluation config.')

        resp = requests.post(
            self.endpoint,
            headers={
                "authorization": f"Bearer {self.token}",
                "content-type": "application/json",
                "service-type": "ai",
            },
            json={
                "model": self.model,
                "messages": messages,
                "stream": False,
            },
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()
class _TranslationsSacreBleu(_Evaluation):
    """
    Compute the bleu or chrF (character level f-score) for a translation.

    This class holds the shared payload handling. Use TranslationsBleu and
    TranslationsChrf for the respective scores.
    """

    requirements = [
        "sacrebleu==2.4.2",
    ]

    # Metric name placed in the result; overridden by each scoring subclass.
    name = ""

    def compute_score(self, trg: str, ref: str) -> float:
        """Score the translation ``trg`` against the reference ``ref``."""
        raise NotImplementedError()

    def run(self, payloads: list[dict[str, Any]]) -> dict:
        """
        Score every payload and return a perftest metric result.

        Each payload must contain a "trg" (translation) and a "ref" (reference)
        entry. The result holds one value per payload, in input order.

        Raises:
            ValueError: if a payload lacks "trg" or "ref", or if no payloads
                were given.
        """
        results: list[float] = []
        for payload in payloads:
            if "trg" not in payload or "ref" not in payload:
                raise ValueError(f"Missing required translation fields in {payload}")
            results.append(self.compute_score(payload["trg"], payload["ref"]))

        if not results:
            raise ValueError(
                "No evaluation results were produced for translation data."
            )

        return {
            "name": self.name,
            "values": results,
            # Both bleu and chrF increase with translation quality, so a drop
            # in the score is the regression direction.
            "lowerIsBetter": False,
        }
class TranslationsBleu(_TranslationsSacreBleu):
    """The bleu variant of the evaluation documented on _TranslationsSacreBleu."""

    name = "bleu"

    def compute_score(self, trg: str, ref: str) -> float:
        # Imported at call time rather than at module load; presumably because
        # sacrebleu is only installed through `requirements` (confirm with the
        # harness).
        import sacrebleu

        self.log("Computing the bleu score")
        # Corpus-level API fed a single hypothesis and its single reference.
        hypotheses = [trg]
        references = [[ref]]
        bleu = sacrebleu.corpus_bleu(hypotheses, references)
        return bleu.score
class TranslationsChrf(_TranslationsSacreBleu):
    """The chrF variant of the evaluation documented on _TranslationsSacreBleu."""

    name = "chrF"

    def compute_score(self, trg: str, ref: str) -> float:
        # Imported at call time rather than at module load; presumably because
        # sacrebleu is only installed through `requirements` (confirm with the
        # harness).
        import sacrebleu

        self.log("Computing the chrF score")
        # Corpus-level API fed a single hypothesis and its single reference.
        hypotheses = [trg]
        references = [[ref]]
        chrf = sacrebleu.corpus_chrf(hypotheses, references)
        return chrf.score
class TranslationsLlmJudge(_LlmJudge):
    """
    Judge a translation based on an LLM's judgement.

    Every payload must provide "src" (source text), "trg" (the translation being
    judged) and "ref" (a reference translation). For each payload the model is
    asked to reply with JSON of the form:

        {
            "score": 0-100,
            "verdict": "good" | "ok" | "bad",
            "explanation": str,
        }

    ``run`` returns a perftest metric result with one score per payload:

        {
            "name": "llm-judge",
            "values": [score, ...],
            "lowerIsBetter": False,
        }
    """

    requirements = []

    # Instruction sent as the system message ahead of every judged translation.
    _SYSTEM_PROMPT = "You are a translation quality judge. Rate adequacy/fluency."

    def run(self, payloads: list[dict[str, Any]]) -> dict:
        """
        Ask the LLM to score every payload and return the collected scores.

        Raises:
            ValueError: if a payload lacks a required field, if the model's
                reply cannot be interpreted, or if no payloads were given.
        """
        scores: list[Any] = []
        for payload in payloads:
            missing = [key for key in ("src", "trg", "ref") if key not in payload]
            if missing:
                raise ValueError(
                    f"Missing required translation fields {missing} in {payload}"
                )

            response = self.query_llm([
                {"role": "system", "content": self._SYSTEM_PROMPT},
                {"role": "user", "content": self._build_prompt(payload)},
            ])
            judgement = self._parse_judgement(response)

            # Only the score is reported as a metric; log the rest so that the
            # model's reasoning is not lost.
            self.log(
                f"LLM judge ({judgement['model']}) score {judgement['score']}: "
                f"{judgement['verdict']} - {judgement['explanation']}"
            )
            scores.append(judgement["score"])

        if not scores:
            raise ValueError("No evaluation results were produced for LLM judge data.")

        return {
            "name": "llm-judge",
            "values": scores,
            "lowerIsBetter": False,
        }

    @staticmethod
    def _build_prompt(payload: dict[str, Any]) -> str:
        """Build the user message that presents one translation to the judge."""
        src = payload["src"]
        trg = payload["trg"]
        ref = payload["ref"]
        # Each field goes on its own line so the source text cannot run into
        # the "Reference:" label.
        return (
            f"Source: {src}\nReference: {ref}\nHypothesis: {trg}\n"
            'Return JSON with fields: score (0-100), verdict ("good"|"ok"|"bad"), explanation (short).'
        )

    @staticmethod
    def _parse_judgement(response: dict[str, Any]) -> dict[str, Any]:
        """Extract the score, verdict, explanation and model from a chat response."""
        # Tolerate a missing or empty "choices" list and null fields; they all
        # end up as empty content, which is reported below.
        choices = response.get("choices") or [{}]
        message = choices[0].get("message") or {}
        content = (message.get("content") or "").strip()
        if not content:
            raise ValueError(f"Missing content in LLM judge response: {response}")

        # Extract the JSON if it's returned with triple backticks.
        if content.startswith("```"):
            content = "\n".join(
                line
                for line in content.splitlines()
                if not line.strip().startswith("```")
            )

        try:
            parsed = json.loads(content)
        except json.JSONDecodeError as exc:
            raise ValueError(
                f"LLM judge response is not valid JSON: {content!r}"
            ) from exc
        if not isinstance(parsed, dict):
            raise ValueError(f"LLM judge response is not a JSON object: {parsed}")

        score = parsed.get("score")
        if score is None:
            raise ValueError(f"Missing score in LLM judge response: {parsed}")
        if isinstance(score, str):
            # Models sometimes quote the number; accept it when it parses.
            try:
                score = float(score)
            except ValueError as exc:
                raise ValueError(
                    f"Invalid score value in LLM judge response: {parsed}"
                ) from exc
        elif isinstance(score, bool) or not isinstance(score, (int, float)):
            # bool is an int subclass, so it has to be rejected explicitly.
            raise ValueError(f"Invalid score value in LLM judge response: {parsed}")

        return {
            "score": score,
            "verdict": parsed.get("verdict"),
            "explanation": parsed.get("explanation"),
            "model": response.get("model"),
        }