evals.py - mozsearch

Enable keyboard shortcuts

# This Source Code Form is subject to the terms of the Mozilla Public

# License, v. 2.0. If a copy of the MPL was not distributed with this

# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import json

import os

from abc import ABC, abstractmethod

from typing import Any, Callable

import requests

class _Evaluation(ABC):
    """
    The abstract base class for an evaluation that is run by mozperftest.

    See python/mozperftest/mozperftest/metrics/eval.py

    Subclasses implement ``run`` and may override ``requirements``.
    """

    # Include a list of requirements that will be pip installed via the test harness.
    # Requirements should not require a build, and should have the appropriate .whl
    # files for reproducibility.
    # This default list is shared by every subclass that does not override it, so
    # subclasses should assign their own list rather than mutate this one.
    requirements: list[str] = []

    def __init__(self, log: Callable[[str], None], config: dict[str, Any]) -> None:
        """
        Store the collaborators handed to the evaluation.

        Args:
            log: Callable that accepts a single message string.
            config: Evaluation-specific configuration values.
        """
        self.log = log
        self.config = config

    @abstractmethod
    def run(self, payloads: list[dict[str, Any]]) -> dict:
        """Run the evaluation and return a perftest metric result."""
        ...

class _LlmJudge(_Evaluation):
    """
    Use the Mozilla LLM Proxy Auth (MLPA) endpoint to run an LLM as a judge.

    Optional keys read from ``config``:
        endpoint: URL that chat-completion requests are POSTed to.
        model: Model identifier sent with each request.

    The bearer token is read from the MOZ_FXA_BEARER_TOKEN environment variable
    when the instance is constructed.
    """

    def __init__(self, log: Callable[[str], None], config: dict[str, Any]) -> None:
        """Resolve the endpoint, model and bearer token used by ``query_llm``."""
        super().__init__(log, config)

        self.endpoint = config.get(
            "endpoint",
            "https://mlpa-prod-prod-mozilla.global.ssl.fastly.net/v1/chat/completions",
        )
        self.model = config.get("model", "vertex_ai/mistral-small-2503")
        # May be None; the absence is only reported once a query is attempted.
        self.token = os.environ.get("MOZ_FXA_BEARER_TOKEN")

    def query_llm(self, messages: list[Any]) -> Any:
        """
        POST ``messages`` to the configured endpoint and return the decoded JSON body.

        Args:
            messages: The chat messages to send, passed through unchanged as the
                "messages" field of the request body.

        Returns:
            The response body parsed from JSON.

        Raises:
            RuntimeError: If MOZ_FXA_BEARER_TOKEN was not set.
            requests.HTTPError: If the endpoint answers with an error status.
        """
        if not self.token:
            raise RuntimeError("Missing MOZ_FXA_BEARER_TOKEN for LLM evaluation.")

        resp = requests.post(
            self.endpoint,
            headers={
                "authorization": f"Bearer {self.token}",
                "content-type": "application/json",
                "service-type": "ai",
            },
            json={
                "model": self.model,
                "messages": messages,
                "stream": False,
            },
            # Seconds; keeps a stalled endpoint from hanging the whole test run.
            timeout=30,
        )
        resp.raise_for_status()
        return resp.json()

class _TranslationsSacreBleu(_Evaluation):
    """
    Compute the bleu or chrF (character level f-score) for a translation.

    https://en.wikipedia.org/wiki/BLEU
    https://en.wikipedia.org/wiki/F-score

    Use TranslationsBleu and TranslationsChrf for the respective scores.
    """

    requirements = [
        "sacrebleu==2.4.2",
    ]

    # Metric name reported in the result; concrete subclasses override it.
    name = ""

    def compute_score(self, trg: str, ref: str) -> float:
        """Score the translation ``trg`` against the reference ``ref``.

        Subclasses must override this; the base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def run(self, payloads: list[dict[str, Any]]):
        """
        Score every payload and return a perftest metric result.

        Args:
            payloads: Dicts that must each contain "trg" (the translation being
                scored) and "ref" (the reference translation).

        Returns:
            A dict with the metric "name", the per-payload "values" and
            "lowerIsBetter".

        Raises:
            ValueError: If a payload lacks "trg" or "ref", or if ``payloads``
                is empty.
        """
        results: list[float] = []

        for payload in payloads:
            if "trg" not in payload or "ref" not in payload:
                raise ValueError(f"Missing required translation fields in {payload}")
            results.append(self.compute_score(payload["trg"], payload["ref"]))

        if not results:
            raise ValueError(
                "No evaluation results were produced for translation data."
            )

        return {
            "name": self.name,
            "values": results,
            # BLEU and chrF both grow with translation quality, so a higher
            # value is the better one.
            "lowerIsBetter": False,
        }

class TranslationsBleu(_TranslationsSacreBleu):
    """BLEU variant of the sacrebleu evaluation; see _TranslationsSacreBleu."""

    name = "bleu"

    def compute_score(self, trg: str, ref: str) -> float:
        """Return the sacrebleu corpus BLEU score of ``trg`` against ``ref``."""
        # Imported at call time rather than at module import.
        import sacrebleu

        self.log("Computing the bleu score")

        # A single hypothesis scored against a single reference stream.
        hypotheses = [trg]
        reference_streams = [[ref]]
        bleu = sacrebleu.corpus_bleu(hypotheses, reference_streams)
        return bleu.score

class TranslationsChrf(_TranslationsSacreBleu):
    """chrF variant of the sacrebleu evaluation; see _TranslationsSacreBleu."""

    name = "chrF"

    def compute_score(self, trg: str, ref: str) -> float:
        """Return the sacrebleu corpus chrF score of ``trg`` against ``ref``."""
        # Imported at call time rather than at module import.
        import sacrebleu

        self.log("Computing the chrF score")

        # A single hypothesis scored against a single reference stream.
        hypotheses = [trg]
        reference_streams = [[ref]]
        chrf = sacrebleu.corpus_chrf(hypotheses, reference_streams)
        return chrf.score

class TranslationsLlmJudge(_LlmJudge):
    """
    Judge a translation based on an LLM's judgement.

    Every payload must provide "src" (the source text), "trg" (the translation
    being judged, sent to the model as the hypothesis) and "ref" (the reference
    translation).

    For each payload the following judgement is collected from the LLM reply:
        {
            "score": int,
            "verdict": str,
            "explanation": str,
            "model": str,
        }

    Only the scores are reported: ``run`` returns a perftest metric result
    named "llm-judge" whose "values" are the per-payload scores.

    NOTE(review): the sample below names the bleu/chrF metrics rather than
    "llm-judge"; confirm that it belongs on this class.

    perfherder_metrics: [
        {
          name: "bleu",
          unit: "bleu",
          lowerIsBetter: false,
          shouldAlert: false,
        },
        {
          name: "chrF",
          unit: "chrF",
          lowerIsBetter: false,
          shouldAlert: false,
        },
    ]
    """

    requirements = []

    def run(self, payloads: list[dict[str, Any]]):
        """
        Ask the LLM to rate every payload and return the scores as a metric.

        Args:
            payloads: Dicts that must each contain "src", "trg" and "ref".

        Returns:
            A dict with "name" ("llm-judge"), the per-payload "values" and
            "lowerIsBetter" (False).

        Raises:
            ValueError: If a payload lacks a required field, the LLM reply is
                not a JSON object carrying a numeric score, or ``payloads`` is
                empty. Malformed JSON surfaces as json.JSONDecodeError, which
                is a ValueError subclass.
        """
        results: list[dict[str, Any]] = []

        for payload in payloads:
            missing = [key for key in ("src", "trg", "ref") if key not in payload]
            if missing:
                raise ValueError(
                    f"Missing required translation fields {missing} in {payload}"
                )

            src = payload["src"]
            trg = payload["trg"]
            ref = payload["ref"]

            # Each labelled field goes on its own line so the source text
            # cannot run into the "Reference:" label.
            user_prompt = (
                f"Source: {src}\nReference: {ref}\nHypothesis: {trg}\n"
                'Return JSON with fields: score (0-100), verdict ("good"|"ok"|"bad"), explanation (short).'
            )

            response = self.query_llm(
                [
                    {
                        "role": "system",
                        "content": "You are a translation quality judge. Rate adequacy/fluency.",
                    },
                    {"role": "user", "content": user_prompt},
                ]
            )
            results.append(self._parse_judgement(response))

        if not results:
            raise ValueError("No evaluation results were produced for LLM judge data.")

        return {
            "name": "llm-judge",
            "values": [result["score"] for result in results],
            "lowerIsBetter": False,
        }

    @staticmethod
    def _parse_judgement(response: dict[str, Any]) -> dict[str, Any]:
        """Extract the score, verdict, explanation and model from one LLM response."""
        # Treat an empty or null "choices"/"message"/"content" like a missing
        # one, so all of them fail in json.loads below instead of raising
        # IndexError or AttributeError here.
        choices = response.get("choices") or [{}]
        message = choices[0].get("message") or {}
        content = (message.get("content") or "").strip()

        # Extract the JSON if it's returned with triple backticks.
        if content.startswith("```"):
            content = "\n".join(
                line
                for line in content.splitlines()
                if not line.strip().startswith("```")
            )

        parsed = json.loads(content)
        if not isinstance(parsed, dict):
            raise ValueError(f"Expected a JSON object in LLM judge response: {parsed}")

        score = parsed.get("score")
        if score is None:
            raise ValueError(f"Missing score in LLM judge response: {parsed}")

        if isinstance(score, str):
            try:
                score = float(score)
            except ValueError as exc:
                raise ValueError(
                    f"Invalid score value in LLM judge response: {parsed}"
                ) from exc

        # bool is an int subclass; reject it together with any non-numeric type.
        if isinstance(score, bool) or not isinstance(score, (int, float)):
            raise ValueError(f"Invalid score value in LLM judge response: {parsed}")

        return {
            "score": score,
            "verdict": parsed.get("verdict"),
            "explanation": parsed.get("explanation"),
            "model": response.get("model"),
        }