import pandas as pd
import random
import json
import os
import openai
from openai import OpenAI
import time
import logging
import httpx
from datetime import datetime
# Read logging level from environment variable
logging_level = os.getenv('LOGGING_LEVEL', 'WARNING').upper()
# Configure logging with the level from the environment variable
logging.basicConfig(
level=getattr(logging, logging_level, logging.WARNING), # Default to WARNING if invalid level
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Create a logger object
logger = logging.getLogger(__name__)
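# Example (illustrative invocation; the script name is hypothetical):
#   LOGGING_LEVEL=DEBUG python run_evaluation.py
# surfaces the logger.debug output emitted throughout this module.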
class LLMEvaluation:
"""
A class for evaluating language model responses.
This class sets up an evaluation pipeline for language models such as GPT-3.5 Turbo. It handles both the generation and the scoring of model responses, with options for customizing system prompts and configuring request timeouts.
Parameters
----------
model_name : str, optional
The name of the model to be used for generating responses. Default is "gpt-3.5-turbo".
openai_key : str, optional
The API key for authenticating requests to OpenAI. If not provided, it will attempt to use the key from environment variables.
generator_system_prompt : str, optional
Custom system prompt for generating responses. If not provided, a default prompt is used.
eval_system_prompt : str, optional
Custom system prompt for evaluation. If not provided, a default evaluation prompt is used.
enable_timeouts : bool, optional
Flag to enable or disable timeouts for API requests. Default is False.
timeouts_options : dict, optional
A dictionary specifying timeout options. Relevant only if `enable_timeouts` is True.
Raises
------
ValueError
If the OpenAI API key is invalid or not provided.
Attributes
----------
client : OpenAI
The OpenAI client configured for interaction with the model.
Examples
--------
>>> evaluator = LLMEvaluation(model_name="gpt-3.5-turbo", openai_key="your-api-key")
>>> evaluator.generator_system_prompt
"You are an intelligent assistant capable of generating responses based on prompts."
Notes
-----
The `openai_key` is essential for accessing OpenAI's API. Ensure the key is valid and has appropriate permissions.
"""
def __init__(self, model_name="gpt-3.5-turbo", openai_key=None, generator_system_prompt=None, eval_system_prompt=None, enable_timeouts=False, timeouts_options=None):
self.model_name = model_name
self.openai_api_key = openai_key if openai_key is not None else os.getenv("OPENAI_API_KEY")
if not self.openai_api_key or not self.openai_api_key.startswith("sk-"):
raise ValueError("Invalid or missing OpenAI API key. Provide openai_key or set the OPENAI_API_KEY environment variable.")
self.client = OpenAI(api_key=self.openai_api_key, max_retries=3)
if enable_timeouts:
if timeouts_options is None:
timeouts_options = {"total": 120.0, "read": 60.0, "write": 60.0, "connect": 10.0}
# httpx.Timeout takes the overall timeout positionally; read/write/connect must be keyword arguments
self.client = self.client.with_options(timeout=httpx.Timeout(timeouts_options["total"], read=timeouts_options["read"], write=timeouts_options["write"], connect=timeouts_options["connect"]))
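# Example custom timeouts (illustrative values only):
#   LLMEvaluation(enable_timeouts=True,
#                 timeouts_options={"total": 300.0, "read": 120.0, "write": 120.0, "connect": 10.0})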
if eval_system_prompt is None:
self.eval_system_prompt = """Your task is to compare a response generated by a model to the correct response.
Rate the generated response on a scale of 0.0 to 5.0, with 5.0 being perfectly correct and 0.0 being completely off.
The output must be valid JSON on a single line, containing the score and a short justification.
The score_reason must not exceed 12 words and must not include any special characters.
Eg: {
\"score\": 4.25,
\"score_reason\": \"12 words reasons without special chars\"
}"""
else:
self.eval_system_prompt = eval_system_prompt
if generator_system_prompt is None:
self.generator_system_prompt = """You are an intelligent assistant capable of generating responses based on prompts."""
else:
self.generator_system_prompt = generator_system_prompt
def save_dict_list_to_csv(self, data, output_file_path=None, output_folder='csv'):
"""
Converts a list of conversation data into a CSV file, categorizing data into columns for system prompts, user prompts, and assistant completions.
:method save_dict_list_to_csv: Process and save conversation data in a structured CSV format.
:type save_dict_list_to_csv: method
:param data: A list of dictionaries, each representing a conversation with messages categorized by roles ('system', 'user', 'assistant') and their respective content.
:type data: list
:param output_file_path: The file path for the output CSV file. Defaults to None, which uses a default filename.
:type output_file_path: str, optional
:param output_folder: The directory to save the output CSV file. Defaults to 'csv'.
:type output_folder: str, optional
:return: None. This method does not return anything but saves the processed data to a CSV file.
:rtype: None
:raises Exception: If any error occurs during the processing or file writing.
:example:
::
>>> data = [{'messages': [{'role': 'system', 'content': 'System message'}, {'role': 'user', 'content': 'User question'}, {'role': 'assistant', 'content': 'Assistant answer'}]}]
>>> save_dict_list_to_csv(data, output_file_path='output.csv')
# This will process the provided data and save it as 'output.csv' in the specified output folder.
:notes:
- The input data should be formatted correctly, with each conversation's messages having designated roles ('system', 'user', 'assistant').
"""
# Check if output folder exists, create if not
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Fall back to a default filename inside output_folder when no path is given
if output_file_path is None:
output_file_path = os.path.join(output_folder, 'output.csv')
try:
# Process data to extract prompts and completions
processed_data = []
for conversation in data:
system_prompt = ""
user_prompt = ""
assistant_completion = ""
for message in conversation['messages']:
if message['role'] == 'system':
system_prompt = message['content']
elif message['role'] == 'user':
user_prompt = message['content']
elif message['role'] == 'assistant':
assistant_completion = message['content']
processed_data.append({
'system_prompt': system_prompt,
'prompt': user_prompt,
'completion': assistant_completion
})
# Create DataFrame and Save to CSV
df = pd.DataFrame(processed_data)
df.to_csv(output_file_path, index=False)
except Exception as e:
raise Exception(f"An error occurred during CSV conversion: {e}")
def read_jsonl(self, file_path):
"""
Reads a JSONL (JSON Lines) file and returns the data as a list of dictionaries.
This method is designed to read and parse data from a JSONL file, where each line of the file is a separate JSON object.
It is particularly useful for processing datasets stored in the JSONL format, commonly used in data processing and machine learning tasks.
:param file_path: The path to the JSONL file to be read.
:type file_path: str
:return: A list of dictionaries, each representing a JSON object from a line in the JSONL file.
:rtype: List[dict]
:raises FileNotFoundError: If the specified file does not exist.
:raises json.JSONDecodeError: If any line in the file is not a valid JSON object.
:example:
::
>>> file_path = 'data.jsonl'
>>> data = read_jsonl(file_path)
>>> print(data[0]) # Display the first JSON object from the list.
"""
with open(file_path, 'r') as file:
return [json.loads(line) for line in file]
def save_random_prompts(self, input_file, output_file=None, output_format='csv', n_samples=100, output_folder='output'):
"""
Selects random prompts from a given file and saves them in the specified format.
:method save_random_prompts: Extract random samples from a data file and save them in a specified format.
:type save_random_prompts: method
:param input_file: The path to the input file, which can be in CSV, JSON, or JSONL format.
:type input_file: str
:param output_file: The base name of the output file without extension. If None, a name with a timestamp and the number of samples will be generated. Defaults to None.
:type output_file: str, optional
:param output_format: The format for the output file, which can be 'csv', 'json', or 'jsonl'. Defaults to 'csv'.
:type output_format: str, optional
:param n_samples: The number of random samples to select from the input file. Defaults to 100.
:type n_samples: int, optional
:param output_folder: The folder where the output file should be saved. Defaults to 'output'.
:type output_folder: str, optional
:return: None. This method does not return anything but saves the extracted samples to a file.
:rtype: None
:raises ValueError: If the input file format is unsupported or if the output format is not one of 'csv', 'json', or 'jsonl'.
:raises Exception: If any other error occurs during the processing or file writing.
:example:
::
>>> save_random_prompts("data.csv", output_file="sample_prompts", output_format='csv', n_samples=50, output_folder='output')
# This will select 50 random prompts from 'data.csv' and save them as 'sample_prompts_[timestamp]_50.csv' in the 'output' folder.
:notes:
- Ensure that the input file is in one of the supported formats (CSV, JSON, or JSONL) for correct processing.
"""
try:
# Check if output folder exists, create if not
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# Read data
if input_file.endswith('.csv'):
data = pd.read_csv(input_file)
data_dict = data.to_dict(orient='records')
elif input_file.endswith('.json'):
data_dict = pd.read_json(input_file, orient='records').to_dict(orient='records')
elif input_file.endswith('.jsonl'):
data_dict = self.read_jsonl(input_file)
else:
raise ValueError("Unsupported file format. Please provide a CSV, JSON, or JSONL file.")
# Select random samples
random_samples = random.sample(data_dict, min(n_samples, len(data_dict)))
if output_file is None:
output_file = "random_prompts"
else:
output_file, _ = os.path.splitext(output_file)
# Construct output file path with timestamp
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
output_file_name = f"{output_file}_{current_time}_{n_samples}.{output_format}"
output_file_path = os.path.join(output_folder, output_file_name)
# Save data in the desired format
if output_format == 'csv':
if input_file.endswith('.jsonl') or input_file.endswith('.json'):
self.save_dict_list_to_csv(random_samples, output_file_path)
else:
df = pd.DataFrame(random_samples) # This creates a DataFrame with columns based on the keys in the dictionaries
df.to_csv(output_file_path, index=False) # Saves the DataFrame to a CSV file without the index column
elif output_format == 'json':
with open(output_file_path, 'w') as file:
json.dump(random_samples, file, indent=4)
elif output_format == 'jsonl':
with open(output_file_path, 'w') as file:
for item in random_samples:
file.write(json.dumps(item) + '\n')
else:
raise ValueError("[save_random_prompts] Unsupported output format. Please choose 'csv', 'json', or 'jsonl'.")
except Exception as e:
raise Exception(f"[save_random_prompts] Error: {e}")
def rephrase_and_optionally_classify(self, prompt,
model_name="gpt-4",
classify=False, classes=None,
prompt_style='student-asking',
temperature=1, max_tokens=256,
top_p=1,
frequency_penalty=0,
presence_penalty=0):
"""
Rephrases a given prompt and optionally classifies it using a specified language model.
This method takes a sentence and rephrases it using the specified language model. It can also classify
the rephrased sentence into provided categories, if classification is requested.
:param prompt: The original sentence that needs to be rephrased.
:type prompt: str
:param model_name: The name of the language model to use, defaults to 'gpt-4'.
:type model_name: str, optional
:param classify: Indicates whether to classify the rephrased sentence, defaults to False.
:type classify: bool, optional
:param classes: The list of classification categories, used if classify is True, defaults to None.
:type classes: list of str, optional
:param prompt_style: The style for rephrasing the prompt, used if classify is False, defaults to 'student-asking'.
:type prompt_style: str, optional
:param temperature: Controls the randomness of the output, defaults to 1.
:type temperature: float, optional
:param max_tokens: The maximum number of tokens to generate, defaults to 256.
:type max_tokens: int, optional
:param top_p: Nucleus sampling parameter, defaults to 1.
:type top_p: float, optional
:param frequency_penalty: Adjusts frequency of token usage, defaults to 0.
:type frequency_penalty: float, optional
:param presence_penalty: Adjusts presence of tokens, defaults to 0.
:type presence_penalty: float, optional
:return: A tuple containing the rephrased prompt and its classification (or None if not classified).
:rtype: tuple
:raises ValueError: If unable to parse the model response as JSON.
:raises Exception: If an error occurs during the API request or processing.
:example:
::
>>> prompt = "What is AI?"
>>> rephrased, classification = rephrase_and_optionally_classify(prompt, classify=True, classes=["ACADEMIC", "RESEARCH"])
>>> print(f"Rephrased: {rephrased}, Classification: {classification}")
# Outputs the rephrased sentence and its classification.
"""
try:
# Default classes if not provided
if classes is None:
classes = ["ACADEMIC", "RESEARCH", "ADMIN", "SCIENCE", "OTHERS", "BUSINESS", "TECHNOLOGY", "HEALTH", "ENTERTAINMENT", "SPORTS"]
classes_str = ", ".join(classes)
if classify:
system_message = f"You are a helpful assistant to rephrase and classify sentences as {classes_str}."
user_message = f"Please rephrase: '{prompt}' and classify it into one of the categories: {classes_str}. Output format: '{{\"rephrased_prompt\" : \"value\", \"classification\" : \"value\"}}'"
else:
system_message = f"You are a helpful assistant that helps rephrase sentences in a '{prompt_style}' style."
user_message = f"Please rephrase the following sentence: '{prompt}'. Output format: '{{\"rephrased_prompt\" : \"value\"}}'"
response = self.client.chat.completions.create(
model=model_name,
messages=[
{"role": "system", "content": system_message},
{"role": "user", "content": user_message}
],
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
stop=["\n"]
)
model_response = response.choices[0].message.content
logger.debug("Model response: %s", model_response)
if classify:
response_data = json.loads(model_response)
return response_data["rephrased_prompt"], response_data["classification"]
else:
logger.debug(model_response)
response_data = json.loads(model_response)
logger.debug(response_data)
return response_data["rephrased_prompt"], None
except json.JSONDecodeError:
raise ValueError("Failed to parse model response as JSON.")
except Exception as e:
raise Exception(f"An unexpected error occurred: {e}")
def rephrase_and_classify_prompts_in_dataset(self, input_csv, output_csv, model_name="gpt-3.5-turbo", classify=False, classes=None):
"""
Processes and classifies prompts from an input CSV file and saves the results to an output CSV file.
:method rephrase_and_classify_prompts_in_dataset: Process prompts from an input CSV file, potentially classify them, and save the results in another CSV file.
:type rephrase_and_classify_prompts_in_dataset: method
:param input_csv: The path to the input CSV file containing prompts and their corresponding completions.
:type input_csv: str
:param output_csv: The path to the output CSV file where the processed data will be saved.
:type output_csv: str
:param model_name: The name of the language model to use for rephrasing prompts. Default is 'gpt-3.5-turbo'.
:type model_name: str, optional
:param classify: Flag indicating whether to classify the rephrased prompts. Default is False.
:type classify: bool, optional
:param classes: The list of classification categories to be used if classification is enabled. Default is None.
:type classes: list of str, optional
:return: None. The method does not return anything but saves the processed data to the specified output CSV file.
:rtype: None
:raises FileNotFoundError: If the input CSV file is not found.
:raises Exception: For any other exceptions that may occur during the processing or file writing.
:example:
::
>>> rephrase_and_classify_prompts_in_dataset("input_prompts.csv", "processed_prompts.csv", classify=True, classes=["class1", "class2"])
# This will read prompts from 'input_prompts.csv', process and optionally classify them, and save the results to 'processed_prompts.csv'.
:notes:
- The method expects the input CSV to have columns named 'prompt' and 'completion'.
- Classification is optional and is performed only if the 'classify' parameter is set to True.
"""
# Create the output folder if it doesn't exist (skip when output_csv has no directory component)
output_folder = os.path.dirname(output_csv)
if output_folder and not os.path.exists(output_folder):
os.makedirs(output_folder)
# Load input data
try:
df_input = pd.read_csv(input_csv)
except FileNotFoundError as e:
logger.error(f"The file {input_csv} was not found: {e}")
return
except Exception as e:
logger.error(f"Error reading input CSV file: {e}")
return
# Creating a DataFrame for the output
df_output = pd.DataFrame(columns=['initial_prompt', 'prompt', 'ground_truth_completion', 'classification'])
for index, row in df_input.iterrows():
try:
initial_prompt = row['prompt']
completion = row['completion']
# Use the rephrase_and_optionally_classify method
rephrased_prompt, classification = self.rephrase_and_optionally_classify(initial_prompt, model_name=model_name, classify=classify, classes=classes)
#logger.debug(f"rephrased_prompt: {rephrased_prompt}")
#logger.debug(f"classification: {classification}")
new_row = {
"initial_prompt": initial_prompt,
"prompt": rephrased_prompt,
"ground_truth_completion": completion,
"classification": str(classification)
}
# Using pd.concat instead of append
df_output = pd.concat([df_output, pd.DataFrame([new_row])], ignore_index=True)
# To avoid hitting rate limits
time.sleep(0.5)
except Exception as e:
logger.error(f"Error processing row {index}: {e}")
# Save to CSV
try:
df_output.to_csv(output_csv, index=False)
except Exception as e:
logger.error(f"Error writing to output CSV file: {e}")
def get_generated_completion(self,
finetuned_model,
prompt,
temperature=1.0,
max_tokens=256,
top_p=1.0,
finetuned_model_start_sequence_only_for_base_models=""):
"""
Retrieves the generated completion from a specified model based on the given prompt.
This method interacts with the OpenAI API to generate a completion based on the provided prompt and model parameters.
It is designed to work with both base and fine-tuned models, offering various customization options for the generation process.
:param finetuned_model: The name of the fine-tuned or base model to use for generating the completion.
:type finetuned_model: str
:param prompt: The input prompt to which the model generates a completion.
:type prompt: str
:param temperature: The sampling temperature, controlling the randomness of the output. Defaults to 1.0.
:type temperature: float, optional
:param max_tokens: The maximum number of tokens to generate in the response. Defaults to 256.
:type max_tokens: int, optional
:param top_p: The top-p sampling parameter, controlling the range of token probabilities considered for sampling. Defaults to 1.0.
:type top_p: float, optional
:param finetuned_model_start_sequence_only_for_base_models: A start sequence used only for base models, if applicable.
:type finetuned_model_start_sequence_only_for_base_models: str, optional
:return: The generated completion as a string.
:rtype: str
:raises ValueError: If an unknown model name is specified.
:raises Exception: If there is an error during the generation process or with the OpenAI API interaction.
:example:
::
>>> finetuned_model = 'gpt-3.5-turbo'
>>> prompt = 'Translate the following English text to French: Hello, how are you?'
>>> completion = get_generated_completion(finetuned_model, prompt)
>>> print(completion)
# Outputs the model-generated translation of the prompt.
"""
#logger.debug('eval_system_prompt', self.eval_system_prompt)
try:
if "gpt" in finetuned_model:
response = self.client.chat.completions.create(
model=finetuned_model,
messages=[
{"role": "system", "content": self.generator_system_prompt},
{"role": "user", "content": prompt}
],
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,
frequency_penalty=0,
presence_penalty=0,
stop=["END"]
)
return response.choices[0].message.content
else:
raise ValueError(f"Unknown model: {finetuned_model}")
except Exception as e:
raise Exception(f"Error in get_generated_completion: {e}")
def score_response(self, prompt, ground_truth_completion, generated_completion, llm_evaluator_model_name="gpt-3.5-turbo"):
"""
Generates a scoring prompt, sends it to a language model, and parses the response to evaluate a given generated completion against a ground truth.
:method score_response: Evaluate the generated completion of a prompt against the ground truth completion using a large language model.
:type score_response: method
:param prompt: The original prompt used in generating the completion.
:type prompt: str
:param ground_truth_completion: The correct or expected response to the prompt.
:type ground_truth_completion: str
:param generated_completion: The response generated by the evaluated model.
:type generated_completion: str
:param llm_evaluator_model_name: Name of the large language model to be used for scoring. Defaults to "gpt-3.5-turbo".
:type llm_evaluator_model_name: str, optional
:return: A tuple containing the score (numeric) and the reasoning behind the score (string).
:rtype: tuple
:raises ValueError: If the response from the model is not in a valid JSON format.
:raises Exception: For any other exceptions that may occur during API calls or processing.
:example:
::
>>> score, reason = score_response("What is AI?", "Artificial Intelligence", "AI is a field in computer science.", "gpt-3.5-turbo")
# This evaluates the generated completion for accuracy and relevance, returning a score and reasoning.
:notes:
- The method constructs an evaluation prompt combining the original prompt, ground truth completion, and the generated completion.
- It then sends this prompt to the specified language model for scoring and parses the model's response to extract the score and reasoning.
"""
#system_prompt = "You are an intelligent assistant capable of evaluating responses based on accuracy and relevance."
input_prompt = (
f"The correct response to the prompt '{prompt}' was: '{ground_truth_completion}'. "
f"The response generated by the test model was: '{generated_completion}'. "
)
try:
response = self.client.chat.completions.create(
model=llm_evaluator_model_name,
messages=[
{"role": "system", "content": self.eval_system_prompt},
{"role": "user", "content": input_prompt},
{"role": "assistant", "content": ""}
],
)
response_content = json.loads(response.choices[0].message.content)
score = response_content["score"]
score_reason = response_content["score_reason"]
return score, score_reason
except json.JSONDecodeError:
raise ValueError("Failed to parse model response as JSON.")
except Exception as e:
raise Exception(f"An unexpected error occurred: {e}")
def evaluate_model(self, finetuned_model, dataset_csv_path, results_csv_path,
temperature=1.0, max_tokens=150, top_p=1.0,
frequency_penalty=0, presence_penalty=0,
model_start_sequence="",
llm_evaluator_model_name="gpt-3.5-turbo", dataset_size=100,
finetuned_model_start_sequence_only_for_base_models="",
experiment_id=1,
save_immediately=False):
"""
Evaluates the performance of a specified model using a dataset and generates statistical insights.
This method assesses a model's ability to respond to prompts, comparing generated responses with expected completions.
It provides a systematic approach to evaluate model performance, saving the results for analysis.
:param finetuned_model: The name of the model to be evaluated.
:type finetuned_model: str
:param dataset_csv_path: Path to the CSV file containing prompts and expected completions for evaluation.
:type dataset_csv_path: str
:param results_csv_path: Path where the evaluation results will be saved.
:type results_csv_path: str
[Other parameters...]
:return: None. The evaluation results are saved to the specified CSV file.
:rtype: None
:raises FileNotFoundError: If the dataset CSV file is not found.
:raises ValueError: If the dataset CSV file is not properly formatted or missing required columns.
:raises Exception: For other exceptions that may occur during the evaluation process.
:example:
::
>>> finetuned_model = 'gpt-3.5-turbo'
>>> dataset_csv_path = 'path/to/dataset.csv'
>>> results_csv_path = 'path/to/results.csv'
>>> evaluate_model(finetuned_model, dataset_csv_path, results_csv_path)
# This will evaluate the model using the dataset and save results to the specified path.
"""
if not os.path.exists(dataset_csv_path):
raise FileNotFoundError(f"[evaluate_model] The input CSV file {dataset_csv_path} was not found.")
# Check if the file extension is .csv
if not dataset_csv_path.endswith('.csv'):
raise ValueError(f"[evaluate_model] The input file {dataset_csv_path} is not a CSV file. Make sure that input_csv of the dataset to evaluate is in a CSV Format.")
try:
df_input = pd.read_csv(dataset_csv_path)
# Required columns
required_columns = ['initial_prompt', 'prompt', 'ground_truth_completion', 'classification']
# Check if all required columns are present
if not all(column in df_input.columns for column in required_columns):
missing_columns = [column for column in required_columns if column not in df_input.columns]
raise ValueError(f"The CSV file is missing the following required columns: {', '.join(missing_columns)}")
all_frames = []
for index, row in df_input.iterrows():
prompt = row["prompt"]
ground_truth_completion = row["ground_truth_completion"]
logger.debug(prompt)
generated_completion = self.get_generated_completion(finetuned_model, prompt, temperature, max_tokens, top_p,
finetuned_model_start_sequence_only_for_base_models)
#logger.debug("*******> generated_completion: ", generated_completion)
# Score the generated completion against the ground truth
score, scoring_reason = self.score_response(prompt, ground_truth_completion, generated_completion)
# Get the current timestamp
current_time = datetime.now()
timestamp_str = current_time.strftime("%Y%m%d%H%M%S")
# Add the timestamp to the output_data dictionary
output_data = {
"Timestamp": timestamp_str,
"Prompt": prompt,
"Ground Truth": row["ground_truth_completion"],
"Generated Response": generated_completion,
"Temperature": temperature,
"Max Tokens": max_tokens,
"Top P": top_p,
"Frequency Penalty": frequency_penalty,
"Presence Penalty": presence_penalty,
"Score": score,
"Classification": row["classification"],
"Model Name": finetuned_model,
"Dataset Size": dataset_size,
"GPT Eval Model": llm_evaluator_model_name,
"Scoring Reason": scoring_reason,
"Experiment_ID": experiment_id
}
# Append the dictionary to the list
all_frames.append(output_data)
# Log progress for debugging
logger.debug("row[%s] Score: %s Prompt: %s %s", index, score, prompt[:50], finetuned_model)
logger.debug("ground_truth_completion: %s", ground_truth_completion[:100])
logger.debug("generated_completion: %s", generated_completion[:100])
logger.debug("scoring: %s", score)
logger.debug("scoring_reason: %s", scoring_reason)
# Save the data immediately if specified
if save_immediately:
# Check if the output CSV file already exists
file_exists = os.path.exists(results_csv_path)
# Append the new data to the CSV file
mode = 'a' if file_exists else 'w'
header = not file_exists # Write header only if file doesn't exist
pd.DataFrame([output_data]).to_csv(results_csv_path, mode=mode, header=header, index=False)
# To avoid hitting rate limits
time.sleep(1.0)
if not save_immediately:
# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(all_frames)
# Save the DataFrame to a CSV file
df.to_csv(results_csv_path, index=False)
# Generate statistics and save them
#self.save_stats(final_df, psugpt_model)
except pd.errors.ParserError as e:
raise ValueError(f"The file {dataset_csv_path} could not be parsed as CSV: {e}")
except openai.APIConnectionError as e:
logger.error(f"[evaluate_model] APIConnectionError error:\n{e}")
except openai.RateLimitError as e:
# The client is configured with max_retries=3, so retries have already been attempted; log the remaining failure
logger.error(f"[evaluate_model] RateLimitError: {e}")
except openai.APIStatusError as e:
logger.error(f"[evaluate_model] APIStatusError:\n{e}")
except AttributeError as e:
logger.error(f"[evaluate_model] AttributeError:\n{e}")
except Exception as e:
raise Exception(f"An error occurred during model evaluation: {e}")
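if __name__ == "__main__":
    # Minimal end-to-end sketch (not part of the library API): the file names below are
    # hypothetical, and a valid OPENAI_API_KEY must be available in the environment.
    evaluator = LLMEvaluation(model_name="gpt-3.5-turbo")
    # 1. Sample prompts from a fine-tuning dataset (JSONL) into a CSV.
    #    Note: the saved file name also carries a timestamp and the sample count.
    evaluator.save_random_prompts("training_data.jsonl", output_file="sample_prompts",
                                  output_format="csv", n_samples=10, output_folder="output")
    # 2. Rephrase and classify the sampled prompts (path shown is illustrative;
    #    substitute the timestamped file produced in step 1).
    evaluator.rephrase_and_classify_prompts_in_dataset("output/sample_prompts.csv",
                                                       "output/rephrased_prompts.csv",
                                                       classify=True)
    # 3. Generate completions with the model under test and score them against the
    #    ground-truth completions.
    evaluator.evaluate_model("gpt-3.5-turbo", "output/rephrased_prompts.csv",
                             "output/evaluation_results.csv", dataset_size=10)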