diff --git a/IDEAS.md b/IDEAS.md
index 841fd6a..5278dfd 100644
--- a/IDEAS.md
+++ b/IDEAS.md
@@ -44,3 +44,15 @@
 - For a few hundred samples it is OK to wait a bit. However, generating thousands of samples should be done Async/Batch as it becomes unacceptable for a user to wait so long.
 - Check what options LiteLLM offers. This is another reason to migrate to LiteLLM.
+### Reasoning Dataset
+- Simple DeepSeek distillation?
+- Agentic Generation?
+- We can have a look at CamelAI, but also check how other reasoning datasets are generated.
+
+### Tests
+- There are no proper tests, just some scripts within `examples`.
+- We should add proper tests soon after launch.
+
+### EvolInstruct not implemented
+- At the moment EvolInstruct is not implemented in PreferenceDataset generation.
+- While this is not critical at this stage, it would be great to have it soon.
\ No newline at end of file
diff --git a/datafast/datasets.py b/datafast/datasets.py
index 7ade965..40cb7a2 100644
--- a/datafast/datasets.py
+++ b/datafast/datasets.py
@@ -17,6 +17,7 @@
     TextDatasetConfig,
     UltraChatDatasetConfig,
     MCQDatasetConfig,
+    PreferenceDatasetConfig,
 )
 from datafast.schema.data_rows import (
     ChatRow,
@@ -26,6 +27,8 @@
     TextSource,
     MCQRow,
     MCQSource,
+    PreferenceRow,
+    PreferenceSource,
 )
 from datafast.expanders import expand_prompts
 import os
@@ -60,6 +63,14 @@ class FollowupQuestion(BaseModel):
         ..., description="Followup question of a user to an AI assistant response."
     )
 
+class EvolveInstructOutput(BaseModel):
+    improved_question: str = Field(...)
+    improved_answer: str = Field(...)
+
+
+class JudgeLLMOutput(BaseModel):
+    assessment: str = Field(...)
+    score: int = Field(..., ge=1, le=10)
 
 class DatasetBase(ABC):
     """Abstract base class for all dataset generators."""
@@ -680,3 +691,326 @@ def _get_default_prompts(self) -> list[str]:
     def _get_distractor_prompt(self) -> str:
         """Return the prompt template for generating incorrect answers."""
         return mcq_prompts.DISTRACTOR_TEMPLATE
+
+
+class PreferenceDataset(DatasetBase):
+    def __init__(self, config: PreferenceDatasetConfig):
+        super().__init__(config)
+        self.config = config
+
+    def generate(self,
+                 question_gen_llm: LLMProvider,
+                 chosen_response_gen_llm: LLMProvider,
+                 rejected_response_gen_llm: LLMProvider,
+                 evolution_llm: LLMProvider = None,
+                 judge_llm: LLMProvider = None):
+        """
+        Generate preference data with chosen and rejected responses.
+
+        Args:
+            question_gen_llm: LLM provider for generating questions/instructions.
+            chosen_response_gen_llm: LLM provider for generating high-quality (chosen) responses.
+            rejected_response_gen_llm: LLM provider for generating lower-quality (rejected) responses.
+            evolution_llm: LLM provider for evolving questions and generating improved responses.
+            judge_llm: LLM provider for scoring responses when llm_as_judge is True.
+
+        Raises:
+            ValueError: If input_documents is missing or empty in the configuration.
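+
+        Example:
+            Minimal illustrative call; the provider and model choices below simply
+            mirror datafast/examples/test_preference.py and are not recommendations.
+
+            >>> from datafast.llms import GoogleProvider, OpenAIProvider
+            >>> config = PreferenceDatasetConfig(input_documents=["Some source text ..."])
+            >>> dataset = PreferenceDataset(config)
+            >>> dataset.generate(
+            ...     question_gen_llm=GoogleProvider(model_id="gemini-1.5-flash"),
+            ...     chosen_response_gen_llm=OpenAIProvider(model_id="gpt-4o-mini"),
+            ...     rejected_response_gen_llm=GoogleProvider(model_id="gemini-1.5-flash"),
+            ... )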
+ """ + if not self.config.input_documents: + raise ValueError("input_documents must be provided in the configuration") + + # Get languages from config, default to English if not specified + languages = self.config.languages or {"en": "English"} + + # For each language, generate examples + for lang_code, language_name in languages.items(): + # Process each input document + for doc in self.config.input_documents: + # Generate questions for each document + questions = self._generate_questions(doc, question_gen_llm, language_name) + + # For each question, generate chosen and rejected responses + for question in questions: + # Generate chosen response + chosen_response = self._generate_chosen_response( + doc, + question, + chosen_response_gen_llm, + language_name + ) + + # Generate rejected response + rejected_response = self._generate_rejected_response( + doc, + question, + rejected_response_gen_llm, + language_name + ) + + # If evolutionary instruction is enabled, refine the instruction and response + if self.config.evol_instruct and evolution_llm: + raise NotImplementedError + # evol_result = self._evolve_question_and_answer( + # doc, + # question, + # chosen_response, + # evolution_llm + # ) + # question = evol_result.improved_question + # chosen_response = evol_result.improved_answer + + + # Initialize model IDs and judge-related variables + chosen_model_id = chosen_response_gen_llm.model_id + rejected_model_id = rejected_response_gen_llm.model_id + chosen_response_score = None + rejected_response_score = None + chosen_response_assessment = None + rejected_response_assessment = None + + # If LLM as judge is enabled, use the judge LLM to evaluate the preference pair + if self.config.llm_as_judge and judge_llm: + # Get judge scores for chosen response + chosen_response_result = self._judge_scoring( + doc, question, chosen_response, judge_llm + ) + chosen_response_score = chosen_response_result.score + chosen_response_assessment = chosen_response_result.assessment + + # Get judge scores for rejected response + rejected_response_result = self._judge_scoring( + doc, question, rejected_response, judge_llm + ) + rejected_response_score = rejected_response_result.score + rejected_response_assessment = rejected_response_result.assessment + + # Swap chosen and rejected responses based on scores if needed + # This ensures the higher-scored response is always the chosen one + if rejected_response_score > chosen_response_score: + # Swap responses + chosen_response, rejected_response = rejected_response, chosen_response + # Swap scores + chosen_response_score, rejected_response_score = rejected_response_score, chosen_response_score + # Swap assessments + chosen_response_assessment, rejected_response_assessment = rejected_response_assessment, chosen_response_assessment + # Swap model IDs + chosen_model_id, rejected_model_id = rejected_model_id, chosen_model_id + + # Create and store the preference row + row_data = { + "input_document": doc, + "question": question, + "chosen_response": chosen_response, + "rejected_response": rejected_response, + "preference_source": PreferenceSource.SYNTHETIC, + "chosen_model_id": chosen_model_id, + "rejected_model_id": rejected_model_id, + "metadata": { + "language": lang_code, + "instruction_model": question_gen_llm.model_id, + } + } + + # Add judge-related fields only if we have a judge + if self.config.llm_as_judge and judge_llm: + row_data.update({ + "chosen_response_score": chosen_response_score, + "rejected_response_score": rejected_response_score, + 
"chosen_response_assessment": chosen_response_assessment, + "rejected_response_assessment": rejected_response_assessment + }) + + row = PreferenceRow(**row_data) + + self.data_rows.append(row) + + # Save intermediate results + self.to_jsonl(self.config.output_file) + + return self + + def _generate_questions(self, document: str, llm: LLMProvider, language_name: str) -> list[str]: + """ + Generate questions based on the input document. + + Args: + document: The input document text. + llm: LLM provider for generating questions. + language_name: The language to generate questions in. + + Returns: + List of generated questions. + """ + # Get prompt templates + templates = self.config.question_generation_prompts or self._get_default_question_prompts() + + # Select a template randomly + template = np.random.choice(templates) + + # Format the prompt + prompt = template.format( + document=document, + num_samples=self.config.num_samples_per_prompt, + language_name=language_name + ) + + # Generate questions using structured output + response = llm.generate( + prompt=prompt, + response_format=UserQuestions + ) + + return response.questions + + def _generate_chosen_response(self, document: str, question: str, llm: LLMProvider, language_name: str) -> str: + """ + Generate a high-quality (chosen) response. + + Args: + document: The input document text. + question: The question to answer. + llm: LLM provider for generating the response. + language_name: The language to generate the response in. + + Returns: + The generated response. + """ + # Get prompt template + template = self.config.chosen_response_generation_prompt or self._get_default_chosen_response_prompt() + + # Format the prompt + prompt = template.format( + document=document, + question=question, + language_name=language_name + ) + + # Generate response + response = llm.generate( + prompt=prompt, + response_format=Answer + ) + + return response.answer + + def _generate_rejected_response(self, document: str, question: str, llm: LLMProvider, language_name: str) -> str: + """ + Generate a lower-quality (rejected) response. + + Args: + document: The input document text. + question: The question to answer. + llm: LLM provider for generating the response. + language_name: The language to generate the response in. + + Returns: + The generated response. + """ + # Get prompt template + template = self.config.rejected_response_generation_prompt or self._get_default_rejected_response_prompt() + + # Format the prompt + prompt = template.format( + document=document, + question=question, + language_name=language_name + ) + + # Generate response + response = llm.generate( + prompt=prompt, + response_format=Answer + ) + + return response.answer + + def _evolve_question_and_answer(self, document: str, question: str, answer: str, llm: LLMProvider) -> EvolveInstructOutput: + """ + Evolve the question and answer. + + Args: + document: The input document text. + question: The original question. + answer: The original answer. + llm: LLM provider for evolving the question and answer. + + Returns: + EvolveInstructOutput with improved question and answer. 
+ """ + raise NotImplementedError + # # Get prompt template + # template = self.config.evolution_prompt or self._get_default_evolution_prompt() + + # # Format the prompt + # prompt = template.format( + # document=document, + # question=question, + # answer=answer + # ) + + # # Generate evolved question and answer + # response = llm.generate( + # prompt=prompt, + # response_format=EvolveInstructOutput, + # ) + + # return response + + def _judge_scoring(self, document: str, question: str, response: str, llm: LLMProvider) -> JudgeLLMOutput: + """ + Score a response using an LLM judge. + + Args: + document: The input document text. + question: The question. + response: The response to evaluate. + llm: LLM provider for judging. + + Returns: + JudgeLLMOutput with assessment and score. + """ + # Get prompt template + template = self.config.judge_prompt or self._get_default_judge_prompt() + + # Format the prompt + prompt = template.format( + document=document, + question=question, + response=response + ) + + # Generate score using the judge LLM + # The JudgeLLMOutput class handles validation and clipping of scores + result = llm.generate( + prompt=prompt, + response_format=JudgeLLMOutput, + ) + + return result + + def _get_default_question_prompts(self) -> list[str]: + """Return the default prompt templates for question generation.""" + from datafast.prompts import preference_prompts + return preference_prompts.QUESTION_GENERATION_TEMPLATES + + def _get_default_chosen_response_prompt(self) -> str: + """Return the default prompt template for chosen response generation.""" + from datafast.prompts import preference_prompts + return preference_prompts.CHOSEN_RESPONSE_TEMPLATE + + def _get_default_rejected_response_prompt(self) -> str: + """Return the default prompt template for rejected response generation.""" + from datafast.prompts import preference_prompts + return preference_prompts.REJECTED_RESPONSE_TEMPLATE + + def _get_default_evolution_prompt(self) -> str: + """Return the default prompt template for evolutionary instruction refinement.""" + from datafast.prompts import preference_prompts + return preference_prompts.EVOLUTION_PROMPT + + def _get_default_judge_prompt(self) -> str: + """Return the default prompt template for LLM judge scoring.""" + from datafast.prompts import preference_prompts + return preference_prompts.JUDGE_PROMPT + diff --git a/datafast/examples/test_documents.py b/datafast/examples/test_documents.py new file mode 100644 index 0000000..bd94ab1 --- /dev/null +++ b/datafast/examples/test_documents.py @@ -0,0 +1,184 @@ +TEST_DOCUMENTS = [ + """ + The **Challenger Disaster** refers to the tragic explosion of the Space Shuttle *Challenger* (OV-099) on January 28, 1986, which led to the deaths of all seven astronauts aboard. The event is one of the most significant tragedies in the history of U.S. space exploration. Here's a detailed chronology of the key events leading up to, during, and following the disaster: + +### **Pre-Launch Timeline** + +**Late 1985:** +- **Launch preparation** for the *Challenger* mission, designated **STS-51-L**, began several months in advance. The mission was a part of NASA's ongoing shuttle program, aimed at deploying satellites and conducting scientific experiments. +- The crew was selected in 1985. It included a special crew member, **Christa McAuliffe**, a high school teacher, who was chosen for the **Teacher in Space** program. McAuliffe's inclusion was meant to inspire students and the general public by having an educator aboard the mission. 
+ +**January 27, 1986 (Day Before Launch):** +- **Unseasonably cold weather** in Florida: Temperatures dropped significantly, reaching as low as 18°F (-7.7°C), which was unusually cold for the region. Concerns about the weather affecting the shuttle's launch grew as engineers at NASA discussed the forecast. +- NASA managers were aware of the cold temperatures but decided to move forward with the launch. Engineers from **Morton Thiokol**, the contractor responsible for the solid rocket boosters (SRBs), raised concerns about the potential effects of the cold on the SRBs' **O-rings**, which sealed joints between segments of the boosters. In extreme cold, these O-rings could lose flexibility and fail to properly seal the joints, potentially leading to catastrophic consequences. Despite these warnings, NASA proceeded with the decision to launch. +""", +""" +### **January 28, 1986: The Day of the Launch** + +**7:00 AM (EST):** +- The **STS-51-L crew** (Commander Francis R. Scobee, Pilot Michael J. Smith, Mission Specialist Ronald McNair, Mission Specialist Ellison Onizuka, Payload Specialist Judith Resnik, and Teacher in Space Christa McAuliffe) arrived at **Kennedy Space Center**. They completed the pre-launch activities and were then taken to the launch pad in the **NASA crew bus**. + +**10:30 AM (EST):** +- **Countdown begins:** Despite engineers’ concerns, NASA proceeded with the final stages of the countdown. At this point, the weather was still cold, but the launch was proceeding as planned. +- The decision to launch was made after a meeting in which engineers from Morton Thiokol voiced their concerns, but the managers were eager to maintain the schedule. + +**11:38 AM (EST) – Launch:** +- **Launch of the Space Shuttle Challenger:** The shuttle lifted off from **Launch Complex 39B** at **Kennedy Space Center**, carrying seven crew members and a payload of scientific experiments, including a communications satellite. +- The shuttle’s SRBs ignited as planned, providing the necessary thrust for the vehicle. Initially, the flight appeared normal. +""", +""" +### **73 Seconds After Launch:** + +**11:39 AM (EST):** +- **Disaster strikes:** + - At **73 seconds after liftoff**, the *Challenger* exploded in a catastrophic event. A plume of smoke and debris erupted from the side of the vehicle, and the shuttle quickly broke apart in mid-air. + - The cause of the explosion was traced to the failure of the SRBs’ O-rings, which were compromised by the cold weather. The failure of these seals allowed hot gases from the boosters to escape, eventually leading to the structural failure of the SRB aft field joint attachment. This caused the SRBs to break apart, and the resulting damage led to the explosion of the **External Fuel Tank** filled with liquid oxygen and liquid hydrogen. + + - The explosion killed all seven astronauts aboard: + - **Francis R. Scobee** (Commander) + - **Michael J. Smith** (Pilot) + - **Ronald McNair** (Mission Specialist) + - **Ellison Onizuka** (Mission Specialist) + - **Judith Resnik** (Mission Specialist) + - **Christa McAuliffe** (Payload Specialist / Teacher in Space) + - **Greg Jarvis** (Payload Specialist) + + - The shuttle's crew perished, but remarkably, the crew members' families were informed almost immediately. +""", +""" +### **Post-Disaster Investigation and Findings** + +**January 28, 1986 - February 1986:** +- **Search and recovery efforts:** In the days following the disaster, the **U.S. 
Navy** and **NASA** conducted extensive search and recovery efforts in the Atlantic Ocean. Wreckage from the shuttle was scattered over a wide area. + +- **President Ronald Reagan's Address:** + - On the evening of January 28, 1986, President Reagan delivered a televised address to the American people. He expressed his deep sorrow over the tragedy, and his speech included the famous line: "The Challenger crew has tragically been lost." + +**February 1986 – March 1986:** +- **The Rogers Commission:** In response to the disaster, President Reagan established the **Rogers Commission** to investigate the cause of the explosion. The commission was chaired by **William P. Rogers**, a former U.S. Attorney General, and included several prominent figures such as physicist **Richard Feynman**. + + - The commission's investigation revealed that the **O-rings failure** due to cold temperatures was the primary cause of the disaster. Additionally, the commission found that **NASA's management** failed to adequately address concerns raised by engineers and contractors before the launch, prioritizing the schedule over safety considerations. + +- **Key Findings:** + - The **O-rings** were not designed to handle the extreme cold temperatures that occurred on the day of the launch. + - **Pressure to meet deadlines** and the **desire to proceed with the launch** led to NASA ignoring engineering concerns. + - The commission made several recommendations to improve safety protocols and decision-making processes within NASA to prevent a similar disaster from occurring in the future. + +**1987-1988:** +- **Changes at NASA:** As a result of the disaster and the Rogers Commission’s findings, significant changes were made in NASA’s safety and decision-making procedures. One of the major recommendations was the need for a **more rigorous review** of any potential risks before launching, and the implementation of a **more transparent process** where dissenting voices within the organization could be heard. + +- **Return to Space:** NASA’s next shuttle mission, **STS-26**, launched in September 1988, with the shuttle **Discovery**, marking the return of the space shuttle program after a 32-month hiatus. +""", +""" +### **Legacy and Impact** + +- **Memorials and Tributes:** + - The **Challenger Memorial** was established at the Kennedy Space Center Visitor Complex in honor of the astronauts who perished. + - Several educational programs, including the **Challenger Center for Space Science Education**, were created in memory of the crew, particularly Christa McAuliffe. + +- **Increased Awareness of Safety:** The Challenger disaster resulted in an intense focus on safety, engineering practices, and oversight of space exploration programs, and it contributed to changes in how NASA handles risk management. + +- **Cultural Impact:** The Challenger tragedy left a lasting impression on American society, leading to a broader understanding of the inherent risks of space exploration. The event was covered extensively in the media, and the images of the shuttle's explosion remain a powerful symbol of both the potential for human achievement and the fragility of human life. + +The **Challenger Disaster** serves as a sobering reminder of the risks involved in space exploration and has shaped space policy and safety procedures in the decades since. +""", +"""The **Apollo 11 Mission** was the first successful crewed mission to land on the Moon, marking a historic achievement for humanity. 
The mission's primary goal was to land a man on the Moon and safely return him to Earth, as part of NASA's Apollo program. Here is a detailed chronology of key events, focusing on the **Apollo 11 Lunar Landing** and the subsequent re-ascent with the Command Module. + +--- + +### **Pre-Launch & Early Mission Events** + +**July 16, 1969:** +- **Launch of Apollo 11**: The mission launched from **Kennedy Space Center**, Florida, at 9:32 AM EDT aboard the **Saturn V rocket**. The spacecraft consisted of three main parts: the **Command Module (CM)**, **Lunar Module (LM)**, and the **Service Module (SM)**. +- The crew included: + - **Neil Armstrong** (Commander) + - **Buzz Aldrin** (Lunar Module Pilot) + - **Michael Collins** (Command Module Pilot) + +- The rocket’s **first stage** provided the initial thrust, followed by stage separation, and the spacecraft continued its journey toward the Moon, aided by the **third stage** of the Saturn V. + +--- +""", +""" +### **Lunar Arrival and Orbit Insertion** + +**July 19, 1969 (Arrival at the Moon):** +- The Apollo 11 spacecraft successfully entered **lunar orbit** after a **Translunar Injection (TLI)** burn. +- The **Command Module (CM)**, carrying Michael Collins, remained in lunar orbit, while **Armstrong** and **Aldrin** prepared for their descent to the lunar surface in the **Lunar Module (LM)**. + +**July 20, 1969 – Lunar Descent:** + +**1:00 PM EDT (LM Separation from CM):** +- Armstrong and Aldrin entered the **Lunar Module (LM)**, called **Eagle**, and separated from the **Command Module (Columbia)**, piloted by Michael Collins. + +**1:11 PM EDT (Lunar Module Descent Begins):** +- The **Lunar Module** began its descent toward the Moon's surface. The LM was piloted by Neil Armstrong, with Buzz Aldrin assisting with systems and navigation. + +**1:25 PM EDT (Landing Site Confirmed):** +- Armstrong manually piloted the **Lunar Module**, as the original landing site was found to be unsuitable due to boulders and other hazards. Armstrong guided the LM toward a safer area on the Sea of Tranquility, a relatively flat and smooth region of the lunar surface. + +**1:46 PM EDT – Touchdown on the Moon:** +- **"The Eagle has landed."** Neil Armstrong reported the successful landing. The Lunar Module touched down on the surface of the Moon after a controlled descent. +- The crew performed a final checklist and confirmed the spacecraft's stability. + +**2:00 PM EDT – First Steps on the Moon:** +- **Neil Armstrong** became the first human to set foot on the Moon, stepping down onto the surface at 10:56 PM UTC (2:56 PM EDT), saying the famous words: **“That’s one small step for [a] man, one giant leap for mankind.”** +- **Buzz Aldrin** followed shortly, becoming the second human to walk on the lunar surface. The two spent approximately **2 hours and 15 minutes** on the surface, collecting samples, taking photographs, and deploying scientific instruments, including: + - A **seismometer** + - An **American flag** + - A **retroreflector** for laser measurements + +**2:45 PM EDT – Moonwalk Ends:** +- The astronauts returned to the Lunar Module, where they sealed the hatch and prepared to re-ascent back to lunar orbit. + +--- +""", +""" +### **Re-Ascent and Return to the Command Module** + +**4:35 PM EDT – Lunar Module Ascent:** +- After concluding their activities on the lunar surface, Armstrong and Aldrin began the **re-ascent** phase. 
They fired the **Lunar Module ascent stage** engine to lift off from the Moon's surface and begin their journey back to the Command Module in lunar orbit. +- **Neil Armstrong** piloted the LM during ascent, and Aldrin assisted with navigation and systems. + +**5:05 PM EDT – Docking with the Command Module:** +- After a successful ascent, the Lunar Module was rendezvoused with the **Command Module (CM)**, where **Michael Collins** was awaiting the astronauts in lunar orbit. The **Lunar Module** docked with the **Command Module Columbia**, and Armstrong and Aldrin transferred into the Command Module. + +**6:00 PM EDT – Lunar Module Separation:** +- The **Lunar Module (LM)** was jettisoned after the astronauts were safely aboard the **Command Module**. The **LM's ascent stage** remained in lunar orbit, while the Command Module with all three astronauts aboard began the journey back to Earth. + +--- +""", +""" +### **Journey Back to Earth and Splashdown** + +**July 24, 1969 – Re-entry and Splashdown:** +- After completing the trans-Earth injection burn to leave lunar orbit, Apollo 11 followed its trajectory back to Earth. + +**11:00 AM EDT – Command Module Re-entry:** +- The **Command Module** re-entered the Earth's atmosphere at **25,000 miles per hour** (about 40,000 km/h). The heat shield protected the spacecraft during re-entry, which produced a significant amount of heat and light. + +**11:45 AM EDT – Splashdown in the Pacific Ocean:** +- The Apollo 11 **Command Module** successfully splashed down in the **Pacific Ocean**, approximately **812 nautical miles** southwest of Hawaii. The spacecraft was immediately recovered by the USS **Hornet**, an aircraft carrier, which had been stationed in the recovery zone. + +- **Neil Armstrong**, **Buzz Aldrin**, and **Michael Collins** were retrieved from the spacecraft and brought aboard the USS Hornet for medical examination and quarantine. + +--- +""", +""" +### **Post-Mission:** + +**Quarantine and Celebrations:** +- The astronauts underwent a period of quarantine to ensure they had not brought back any potential lunar pathogens. Although no biological threats were found, the quarantine was a precautionary measure based on the knowledge of space travel at the time. + +- Following quarantine, the astronauts returned to the United States and participated in a **worldwide tour**, where they were hailed as heroes and celebrated by millions. They met with heads of state, including President Richard Nixon, and received numerous awards and honors. + +--- + +### **Legacy of the Apollo 11 Mission:** +- The **Apollo 11 mission** is considered one of the most significant events in human history, marking the first time humans set foot on another celestial body. The mission demonstrated the potential of space exploration and highlighted the achievements of the space program. + +- Neil Armstrong's famous words, **“That’s one small step for [a] man, one giant leap for mankind,”** have remained a defining moment in the history of human exploration. + +The success of Apollo 11 paved the way for further lunar exploration under the **Apollo program**, though no other missions landed on the Moon after Apollo 17 in 1972. Nonetheless, the legacy of Apollo 11 remains central to humanity’s aspirations in space. 
+""" +] \ No newline at end of file diff --git a/datafast/examples/test_preference.py b/datafast/examples/test_preference.py new file mode 100644 index 0000000..97b9fcb --- /dev/null +++ b/datafast/examples/test_preference.py @@ -0,0 +1,56 @@ +""" +Example script for generating a Preference dataset with chosen and rejected responses. +""" + +from datafast.schema.config import PreferenceDatasetConfig +from datafast.datasets import PreferenceDataset +from datafast.llms import OpenAIProvider, GoogleProvider + +from datafast.examples.test_documents import TEST_DOCUMENTS + + +def main(): + # 1. Define the configuration + config = PreferenceDatasetConfig( + input_documents=TEST_DOCUMENTS, + num_samples_per_prompt=2, # Generate 2 questions per document + languages={"en": "English", "fr": "French"}, # Generate in multiple languages + llm_as_judge=True, # Use LLM to judge and score responses + output_file="preference_test_dataset.jsonl", + ) + + # 2. Initialize LLM providers + question_gen_llm = GoogleProvider(model_id="gemini-1.5-flash") + chosen_response_gen_llm = OpenAIProvider(model_id="gpt-4o-mini") + rejected_response_gen_llm = GoogleProvider(model_id="gemini-1.5-flash") + judge_llm = OpenAIProvider(model_id="gpt-4o-mini") + + # 3. Generate the dataset + dataset = PreferenceDataset(config) + dataset.generate( + question_gen_llm=question_gen_llm, + chosen_response_gen_llm=chosen_response_gen_llm, + rejected_response_gen_llm=rejected_response_gen_llm, + judge_llm=judge_llm + ) + + # 4. Print results summary + print(f"\nGenerated {len(dataset.data_rows)} preference pairs") + print(f"Results saved to {config.output_file}") + + # 5. Display a sample of the generated data + if dataset.data_rows: + sample = dataset.data_rows[0] + print("\nSample preference pair:") + print(f"Question: {sample.question}") + print(f"Chosen model: {sample.chosen_model_id}") + print(f"Rejected model: {sample.rejected_model_id}") + if sample.chosen_response_score is not None: + print(f"Chosen response score: {sample.chosen_response_score}") + print(f"Rejected response score: {sample.rejected_response_score}") + + +if __name__ == "__main__": + from dotenv import load_dotenv + load_dotenv("secrets.env") + main() diff --git a/datafast/prompts/preference_prompts.py b/datafast/prompts/preference_prompts.py new file mode 100644 index 0000000..773ca04 --- /dev/null +++ b/datafast/prompts/preference_prompts.py @@ -0,0 +1,75 @@ +""" +Prompt templates for preference dataset generation. +""" + +# Default prompt templates for generating questions from documents +QUESTION_GENERATION_TEMPLATES = [ + "Based on the following document, generate {num_samples} clear and specific questions in {language_name} that would require detailed responses. The questions should be diverse and cover different aspects of the document content:\n\n{document}", + "You are an expert interviewer. Given this document, create {num_samples} thoughtful questions in {language_name} that would elicit detailed and informative responses. Focus on different aspects of the content:\n\n{document}", + "Generate {num_samples} questions in {language_name} based on this document that would be suitable for testing an AI assistant's ability to provide helpful, accurate, and comprehensive answers:\n\n{document}" +] + +# Prompt template for generating high-quality (chosen) responses +CHOSEN_RESPONSE_TEMPLATE = """You are an expert AI assistant known for providing exceptionally helpful, accurate, and comprehensive responses. 
+Given the document and question below, provide a detailed, well-structured and concise answer in {language_name}: + +DOCUMENT: +{document} + +QUESTION: +{question} + +Your response should be helpful, concise, well-organized, and directly address all aspects of the question.""" + +# Prompt template for generating lower-quality (rejected) responses +REJECTED_RESPONSE_TEMPLATE = """Provide a response in {language_name} to the following question based on the document: + +DOCUMENT: +{document} + +QUESTION: +{question}""" + +# Prompt template for evolutionary instruction refinement +EVOLUTION_PROMPT = """Your task is to evolve both the question and answer to create a more challenging and interesting version. + +ORIGINAL DOCUMENT: +{document} + +ORIGINAL QUESTION: +{question} + +ORIGINAL ANSWER: +{answer} + +First, improve the question to make it more specific, nuanced, or complex while still being answerable from the document. +Then, provide an improved answer to your evolved question that is more comprehensive, accurate, and helpful than the original. + +Your response should include both the improved question and improved answer.""" + +# Prompt template for LLM judge scoring +JUDGE_PROMPT = """You are an expert evaluator assessing the quality of responses from an AI assistant to user queries. +Rate the following response on a scale from 1 to 10, where 1 is extremely poor and 10 is excellent. + +DOCUMENT: +{document} + +QUESTION: +{question} + +RESPONSE TO EVALUATE: +{response} + +Consider these criteria in your evaluation: +- Accuracy: Does the response provide correct information based on the document? +- Completeness: Does the response address all aspects of the question? +- Clarity: Is the response well-organized and easy to understand? +- Conciseness: Is the response concise and to the point? +- Helpfulness: Would the response be genuinely useful to someone asking this question? + +Provide a brief assessment of the response, highlighting specific strengths and weaknesses. + +YOUR SCORE MUST BE AN INTEGER BETWEEN 1 AND 10 INCLUSIVE. Do not provide decimal or fractional scores. 
+ +Format your response with your assessment followed by the score on its own line.""" + diff --git a/datafast/schema/config.py b/datafast/schema/config.py index 44e1cf1..4d41fd4 100644 --- a/datafast/schema/config.py +++ b/datafast/schema/config.py @@ -154,25 +154,25 @@ class UltraChatDatasetConfig(BaseModel): description="Optional custom prompt templates for question generation", ) - persona_question_reformulation_prompt: str = Field( + persona_question_reformulation_prompt: Optional[str] = Field( default=None, description="Optional custom prompt template to reformulate \ questions based on personas", ) - simulated_assistant_prompt: str = Field( + simulated_assistant_prompt: Optional[str] = Field( default=None, description="Optional custom prompt template for the simulated \ assistant", ) - user_system_prompt: str = Field( + user_system_prompt: Optional[str] = Field( default=None, description="Optional custom system prompt for the AI to act \ as a user", ) - user_followup_prompt: str = Field( + user_followup_prompt: Optional[str] = Field( default=None, description="Optional custom prompt template for the user's \ follow-up message", @@ -262,3 +262,82 @@ def validate_text_column(cls, v): if not v: raise ValueError("text_column is required") return v + + +class PreferenceDatasetConfig(BaseModel): + dataset_type: str = Field(default="preference_dataset") + + # Input documents + input_documents: list[str] = Field( + default_factory=list, + description="List of input documents from which questions will be generated" + ) + + num_samples_per_prompt: int = Field( + default=3, + description="Number of questions generated per persona/document pair" + ) + + question_generation_prompts: Optional[list[str]] = Field( + default=None, + description="Optional custom prompt templates for question generation", + ) + + chosen_response_generation_prompt: Optional[str] = Field( + default=None, + description="Optional custom prompt template for generation of the chosen response", + ) + + rejected_response_generation_prompt: Optional[str] = Field( + default=None, + description="Optional custom prompt template for generation of the rejected response", + ) + + output_file: str = Field( + default="preference_dataset.jsonl", + description="Path to save preference dataset results" + ) + + # Expansion config + expansion: PromptExpansionConfig = PromptExpansionConfig() + + languages: dict[str, str] = Field( + default={"en": "English"}, + description="Language ISO codes and their corresponding names", + ) + + evol_instruct: bool = Field( + default=False, + description="Whether to use evolutionary instruction refinement" + ) + + llm_as_judge: bool = Field( + default=False, + description="Whether to use an LLM as judge for preference pairs scoring" + ) + + # Conditional fields for evol_instruct + evolution_prompt: Optional[str] = Field( + default=None, + description="Prompt template for evolutionary instruction refinement (required when evol_instruct=True)" + ) + + # Conditional fields for llm_as_judge + judge_prompt: Optional[str] = Field( + default=None, + description="Prompt template for the LLM judge (required when llm_as_judge=True)" + ) + + @field_validator("evolution_prompt") + def validate_evolution_prompt(cls, v, info): + values = info.data + if values.get("evol_instruct", False) and not v: + raise ValueError("evolution_prompt is required when evol_instruct is True") + return v + + @field_validator("judge_prompt") + def validate_judge_prompt(cls, v, info): + values = info.data + if values.get("llm_as_judge", 
False) and not v: + raise ValueError("judge_prompt is required when llm_as_judge is True") + return v diff --git a/datafast/schema/data_rows.py b/datafast/schema/data_rows.py index 76b1494..b5fdd40 100644 --- a/datafast/schema/data_rows.py +++ b/datafast/schema/data_rows.py @@ -72,4 +72,31 @@ class MCQRow(BaseModel): mcq_source: MCQSource = MCQSource.SYNTHETIC uuid: UUID = Field(default_factory=uuid4) metadata: dict[str, str] = Field(default_factory=dict) - \ No newline at end of file + + +class PreferenceSource(str, Enum): + SYNTHETIC = "synthetic" + VERIFIED = "verified" + HUMAN = "human" + CONSENSUS = "consensus" + + +class PreferenceRow(BaseModel): + """Row for storing preference data with chosen and rejected responses.""" + + input_document: str + question: str + chosen_response: str + rejected_response: str + preference_source: PreferenceSource = PreferenceSource.SYNTHETIC + chosen_model_id: Optional[str] = None + rejected_model_id: Optional[str] = None + + # Optional judge-related fields + chosen_response_score: Optional[int] = None + rejected_response_score: Optional[int] = None + chosen_response_assessment: Optional[str] = None + rejected_response_assessment: Optional[str] = None + + uuid: UUID = Field(default_factory=uuid4) + metadata: dict[str, str] = Field(default_factory=dict) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7546d36..e98538a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "datafast" -version = "0.0.4" +version = "0.0.5" description = "A Python package for synthetic text dataset generation" readme = "README.md" license = { file = "LICENSE" }
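
The IDEAS.md changes above note that there are no proper tests yet, only example scripts. A minimal pytest sketch for the new config and row schemas added in this diff could start like the following; the file path `tests/test_preference_schema.py` and the test names are assumptions, not part of this diff.

```python
# tests/test_preference_schema.py — illustrative sketch, not part of this diff.
import pytest
from pydantic import ValidationError

from datafast.schema.config import PreferenceDatasetConfig
from datafast.schema.data_rows import PreferenceRow, PreferenceSource


def test_config_defaults():
    # Defaults declared on PreferenceDatasetConfig.
    config = PreferenceDatasetConfig(input_documents=["Some document text."])
    assert config.num_samples_per_prompt == 3
    assert config.languages == {"en": "English"}
    assert config.output_file == "preference_dataset.jsonl"
    assert config.evol_instruct is False
    assert config.llm_as_judge is False


def test_judge_prompt_required_when_llm_as_judge():
    # judge_prompt is passed explicitly as None so the field validator runs;
    # Pydantic v2 does not run field validators on untouched defaults.
    with pytest.raises(ValidationError):
        PreferenceDatasetConfig(
            input_documents=["doc"], llm_as_judge=True, judge_prompt=None
        )


def test_evolution_prompt_required_when_evol_instruct():
    # Same pattern for the evolution_prompt validator.
    with pytest.raises(ValidationError):
        PreferenceDatasetConfig(
            input_documents=["doc"], evol_instruct=True, evolution_prompt=None
        )


def test_preference_row_defaults():
    row = PreferenceRow(
        input_document="doc",
        question="What happened?",
        chosen_response="A detailed answer.",
        rejected_response="A short answer.",
    )
    assert row.preference_source == PreferenceSource.SYNTHETIC
    assert row.chosen_response_score is None
    assert row.metadata == {}
```

Note that the two validator tests pass the prompt explicitly as `None` because Pydantic v2 skips validators for untouched defaults; if the intent is to reject `llm_as_judge=True` with no `judge_prompt` supplied at all, the field would need `validate_default=True`.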