copied project files into the PR
This commit is contained in:
@@ -1,3 +1,102 @@
|
||||
# My Contribution - Project Week 3
|
||||
# 🧠 Synthetic Data Generator
|
||||
|
||||
A Python-based tool to generate structured, synthetic job postings using open-source LLMs from Hugging Face.
|
||||
This project supports both **script-based execution** and an **interactive Colab notebook**, making it ideal for rapid prototyping, dataset bootstrapping, or demonstrating prompt engineering techniques.
|
||||
|
||||
> Note: Original Repo can be found at: https://github.com/moawiah/synthetic_data_generator
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
This tool helps:
|
||||
- Researchers create labeled training data for NLP classification or QA
|
||||
- HR tech startups prototype recommendation models
|
||||
- AI instructors demonstrate few-shot prompting in class
|
||||
|
||||
|
||||
---
|
||||
|
||||
## ✨ Features
|
||||
|
||||
- 🔗 Integrates Hugging Face Transformer models
|
||||
- 📄 Generates realistic job postings in structured JSON format
|
||||
- 🧪 Supports prompt engineering with control over output length and variability
|
||||
- 🧠 Minimal Gradio UI for non-technical users
|
||||
- 📓 Jupyter/Colab support for experimentation and reproducibility
|
||||
|
||||
## 📂 Project Structure
|
||||
```
.
├── app/
│   ├── app.py                          # Main script entry point
│   ├── consts.py                       # Configuration and constants
│   └── requirements.txt                # Python dependencies
├── data/
│   └── software_engineer_jobs.json     # Sample input data (JSON format)
├── notebooks/
│   └── synthetic_data_generator.ipynb  # Interactive Colab notebook
├── .env.example                        # Sample environment variable config
├── .gitignore                          # Git ignored files list
└── README.md
```
|
||||
|
||||
## 🚀 Getting Started
|
||||
|
||||
### Clone the repository
|
||||
```bash
|
||||
git clone https://github.com/moawiah/synthetic_data_generator.git
|
||||
cd synthetic_data_generator
|
||||
```
|
||||
### Install Dependencies
|
||||
```bash
|
||||
pip install -r app/requirements.txt
|
||||
```
|
||||
### Hugging Face Token
|
||||
Create a `.env` file containing your Hugging Face token, for example `HF_TOKEN=your-token-here` (see `.env.example`).
|
||||
|
||||
### Run
|
||||
Run the app with:
|
||||
`python app/app.py`
|
||||
|
||||
|
||||
## Example Output - 1 Job
|
||||
|
||||
```JSON
|
||||
{
|
||||
"title": "Software Engineer"
|
||||
,
|
||||
"description": "We are seeking a highly skilled software engineer to join our team and contribute to the development of innovative software solutions. The ideal candidate will have experience in designing, coding, and testing software systems, and will be able to work collaboratively with cross-functional teams. Responsibilities include writing clean, maintainable, and efficient code, as well as actively participating in code reviews and continuous integration processes. This is an excellent opportunity for a self-starter with a passion for technology and a desire to grow in their career."
|
||||
,
"requirements": [
  "Bachelor's degree in Computer Science or related field",
  "Minimum of 2 years experience in software development",
  "Strong proficiency in Java or C++",
  "Experience with agile development methodologies",
  "Good understanding of data structures and algorithms",
  "Excellent problem-solving and analytical skills"
],
|
||||
"location":"New York, NY",
|
||||
"company_name":"ABC Technologies"
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
|
||||
## Future Improvements
|
||||
🔁 Add support for more job roles and industries
|
||||
|
||||
🧠 Model selector from UI
|
||||
|
||||
💾 Export dataset as CSV
|
||||
|
||||
☁️ Optional integration with LangChain or RAG workflows
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Here is a link to my project for this week: https://github.com/moawiah/synthetic_data_generator
|
||||
156
week3/muawiya/app/app.py
Normal file
156
week3/muawiya/app/app.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import os
|
||||
import requests
|
||||
from IPython.display import Markdown, display, update_display
|
||||
from openai import OpenAI
|
||||
from google.colab import drive
|
||||
from huggingface_hub import login
|
||||
from google.colab import userdata
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, pipeline, TextGenerationPipeline
|
||||
import torch
|
||||
from consts import FALCON, MISTRAL, Databricks
|
||||
from dotenv import load_dotenv
|
||||
import json
|
||||
import ast
|
||||
import gradio as gr
|
||||
import re
|
||||
|
||||
# Load variables from a local .env file, then read the Hugging Face token.
# NOTE(review): the token is only read here; no login call using it is
# visible in this file - confirm whether one is intended.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")
|
||||
|
||||
|
||||
# Prompt template for a single job posting. It is filled in with
# ``prompt.format(role=...)``, so the placeholder must use single braces:
# doubled braces are str.format's escape for a literal brace and would leave
# the text "{role}" in the prompt instead of the requested role.
prompt = """
Generate one fake job posting for a {role}.

Return only a single JSON object with:
- title
- description (5-10 sentences)
- requirements (array of 4-6 strings)
- location
- company_name

No explanations, no extra text.
Only the JSON object.
"""
|
||||
|
||||
# Quantization settings passed to the model loader as ``quantization_config``.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
|
||||
|
||||
def load_model_and_tokenizer():
    """Load the ``MISTRAL`` checkpoint together with its tokenizer.

    The model is created with the module-level ``bnb_config`` quantization
    settings and a device map that targets ``"cuda"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_kwargs = {
        "device_map": {"": "cuda"},
        "trust_remote_code": True,
        "offload_folder": "/tmp/dolly_offload",
        "quantization_config": bnb_config,
    }

    tok = AutoTokenizer.from_pretrained(MISTRAL, trust_remote_code=True)
    llm = AutoModelForCausalLM.from_pretrained(MISTRAL, **model_kwargs)
    return llm, tok
|
||||
|
||||
|
||||
def generate_job(role="Software Engineer", model=None, tokenizer=None):
    """Generate the raw text of one job posting for ``role``.

    Args:
        role: Job title substituted into the module-level ``prompt`` template.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    encoded = tokenizer(prompt.format(role=role), return_tensors="pt")

    # Move every input tensor onto the device the model lives on.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(model.device)

    sampling = dict(
        max_new_tokens=600,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    sequences = model.generate(**on_device, **sampling)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)
|
||||
|
||||
def generate_jobs(role="Software Engineer", n=5):
    """Generate ``n`` raw job postings for ``role``.

    The model and tokenizer are loaded once per call and reused for every
    sample.

    Args:
        role: Job title to generate postings for.
        n: Number of postings to generate. Coerced to ``int`` so that a
            whole-number float (e.g. from a numeric UI field) is accepted.

    Returns:
        A list with one raw generated string per posting.
    """
    model, tokenizer = load_model_and_tokenizer()
    # Honour the caller's ``role``; it must not be replaced by a fixed value.
    return [
        generate_job(role=role, model=model, tokenizer=tokenizer)
        for _ in range(int(n))
    ]
|
||||
|
||||
def extract_json_objects_from_text_block(texts):
    """Extract every valid JSON object embedded in free-form text.

    Each ``{`` is tried as the start of a JSON object using
    ``json.JSONDecoder.raw_decode``. Unlike a "first ``{`` to first ``}``"
    match, this handles nested objects and ``}`` characters inside string
    values.

    Args:
        texts: A single string or an iterable of strings to scan.

    Returns:
        A list of the decoded objects (dicts), in the order they appear.
        Text that is not valid JSON is skipped; an empty input yields ``[]``.
    """
    if isinstance(texts, str):
        texts = [texts]  # wrap in list if single string

    decoder = json.JSONDecoder()
    results = []

    for raw_text in texts:
        pos = raw_text.find("{")
        while pos != -1:
            try:
                obj, end = decoder.raw_decode(raw_text, pos)
            except json.JSONDecodeError:
                # No valid object starts at this brace; try the next one.
                pos = raw_text.find("{", pos + 1)
                continue
            results.append(obj)
            # Resume after the decoded object so matches never overlap.
            pos = raw_text.find("{", end)

    return results
|
||||
|
||||
def generate_ui(role, n):
    """Gradio callback: generate postings, save them as JSON, return both.

    Args:
        role: Job title entered in the UI.
        n: Number of postings requested in the UI.

    Returns:
        ``(parsed_jobs, filename)`` on success, where ``parsed_jobs`` is the
        list of extracted job dicts and ``filename`` is the JSON file written
        under ``data/``. On any failure, an update that shows an empty value
        and ``None`` for the file.
    """
    try:
        raw_jobs = generate_jobs(role, int(n))
        parsed_jobs = extract_json_objects_from_text_block(raw_jobs)

        if not isinstance(parsed_jobs, list) or not all(isinstance(item, dict) for item in parsed_jobs):
            print("[ERROR] Parsed result is not a list of dicts")
            return gr.update(value=[], visible=True), None

        # The role comes straight from the UI: keep only filename-safe
        # characters so it cannot introduce path separators.
        safe_role = re.sub(r"[^\w.-]", "_", role).lower()
        # The output directory may not exist relative to the working directory.
        os.makedirs("data", exist_ok=True)
        filename = f"data/{safe_role}_jobs.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(parsed_jobs, f, indent=2)

        print(f"[INFO] Returning {len(parsed_jobs)} jobs -> {filename}")
        return parsed_jobs, filename

    except Exception as e:
        # UI boundary: report the problem and return an empty result instead
        # of propagating the exception to Gradio.
        print(f"[FATAL ERROR] {e}")
        return gr.update(value=[], visible=True), None
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Assemble the page: two inputs, a trigger button, and two outputs
    # (the parsed jobs as JSON and the saved file).
    with gr.Blocks() as demo:
        gr.Markdown("# 🧠 Synthetic Job Dataset Generator")
        gr.Markdown("Generate a structured dataset of job postings for a specific role.")

        with gr.Row():
            role_box = gr.Textbox(
                value="Software Engineer",
                label="Job Role",
                placeholder="e.g. Software Engineer",
            )
            count_box = gr.Number(value=5, precision=0, label="Number of Samples")

        run_button = gr.Button("🚀 Generate")
        jobs_view = gr.JSON(label="Generated Dataset")
        file_view = gr.File(label="Download JSON")

        # generate_ui returns (jobs, filename), matching the two outputs.
        run_button.click(
            fn=generate_ui,
            inputs=[role_box, count_box],
            outputs=[jobs_view, file_view],
        )

    demo.launch(debug=True, share=True)
|
||||
|
||||
|
||||
5
week3/muawiya/app/consts.py
Normal file
5
week3/muawiya/app/consts.py
Normal file
@@ -0,0 +1,5 @@
|
||||
# Model identifiers used by the app.
GPT = "gpt2"
FALCON = "tiiuae/falcon-rw-1b"
# Passed to ``from_pretrained`` by app.py's model loader.
MISTRAL = "mistralai/Mistral-7B-Instruct-v0.1"
# Mixed-case name kept as-is: app.py imports it by this exact name.
Databricks = "databricks/dolly-v2-3b"
|
||||
7
week3/muawiya/app/requirements.txt
Normal file
7
week3/muawiya/app/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
# torch's "+cu124" build is served from the PyTorch wheel index, not PyPI.
--extra-index-url https://download.pytorch.org/whl/cu124

huggingface_hub==0.30.2
ipython==8.12.3
openai==1.76.2
protobuf==6.30.2
Requests==2.32.3
torch==2.6.0+cu124
transformers==4.51.3

# Needed by app.py but previously unlisted (left unpinned; pin once the
# tested versions are known).
gradio
python-dotenv
# Required for the 4-bit BitsAndBytesConfig and the device_map argument.
bitsandbytes
accelerate
|
||||
71
week3/muawiya/data/software_engineer_jobs.json
Normal file
71
week3/muawiya/data/software_engineer_jobs.json
Normal file
@@ -0,0 +1,71 @@
|
||||
[
|
||||
{
|
||||
"title": "Software Engineer",
|
||||
"description": "We are seeking a highly skilled software engineer to join our team in developing and maintaining complex software systems. The ideal candidate will have a strong background in computer science and experience with multiple programming languages. Responsibilities include writing clean and efficient code, collaborating with cross-functional teams, and actively participating in code reviews. This is an excellent opportunity for a self-starter with a passion for technology and a desire to grow in their career.",
|
||||
"requirements": [
|
||||
"Bachelor's degree in Computer Science or related field",
|
||||
"3+ years of experience in software development",
|
||||
"Strong proficiency in Java or C++",
|
||||
"Experience with agile development methodologies",
|
||||
"Excellent problem-solving and analytical skills"
|
||||
],
|
||||
"location": "New York, NY",
|
||||
"company_name": "ABC Technologies"
|
||||
},
|
||||
{
|
||||
"title": "Software Engineer",
|
||||
"description": "We are looking for a highly skilled software engineer to join our team and contribute to the development of innovative software solutions. The ideal candidate will have experience in designing, developing, and testing software systems, and be able to work independently or as part of a team. Responsibilities include writing clean and efficient code, collaborating with cross-functional teams, and actively participating in code reviews. Must have a strong understanding of computer science principles and be able to learn quickly. This is a full-time position located in San Francisco, CA.",
|
||||
"requirements": [
|
||||
"Bachelor's degree in Computer Science or related field",
|
||||
"3+ years of experience in software development",
|
||||
"Strong proficiency in Java or C++",
|
||||
"Experience with agile development methodologies",
|
||||
"Excellent problem-solving skills",
|
||||
"Ability to work in a fast-paced environment"
|
||||
],
|
||||
"location": "San Francisco, CA",
|
||||
"company_name": "Acme Inc."
|
||||
},
|
||||
{
|
||||
"title": "Software Engineer",
|
||||
"description": "We are seeking a highly skilled software engineer to join our team in developing and maintaining our cutting-edge software applications. The ideal candidate will have a strong background in computer science and software engineering, with experience in designing, coding, and testing software systems. Responsibilities include collaborating with cross-functional teams, writing clean and efficient code, and ensuring the timely delivery of high-quality software products. This is an excellent opportunity for a self-starter with a passion for technology and a desire to work in a dynamic and fast-paced environment.",
|
||||
"requirements": [
|
||||
"Bachelor's degree in Computer Science or related field",
|
||||
"3+ years of experience in software engineering",
|
||||
"Strong proficiency in Java, Python, or C++",
|
||||
"Experience with agile development methodologies",
|
||||
"Excellent problem-solving and analytical skills",
|
||||
"Strong communication and interpersonal skills"
|
||||
],
|
||||
"location": "New York, NY",
|
||||
"company_name": "ABC Tech"
|
||||
},
|
||||
{
|
||||
"title": "Software Engineer",
|
||||
"description": "We are seeking a highly skilled software engineer to join our team and contribute to the development of innovative software solutions. The ideal candidate will have a strong background in computer science and experience with various programming languages and technologies. Responsibilities include designing, coding, testing, and maintaining software systems, as well as collaborating with cross-functional teams. This is an excellent opportunity for a creative and motivated individual to make a significant impact in the tech industry.",
|
||||
"requirements": [
|
||||
"Bachelor's degree in Computer Science or related field",
|
||||
"Minimum of 2 years experience in software development",
|
||||
"Strong proficiency in Java, Python, or C++",
|
||||
"Experience with agile development methodologies",
|
||||
"Excellent problem-solving and analytical skills",
|
||||
"Ability to work independently and as part of a team",
|
||||
"Strong communication and interpersonal skills"
|
||||
],
|
||||
"location": "New York, NY",
|
||||
"company_name": "ABC Tech Inc."
|
||||
},
|
||||
{
|
||||
"title": "Software Engineer",
|
||||
"description": "We are looking for a skilled software engineer to join our team and contribute to the development of innovative software solutions. Responsibilities include designing, coding, testing and maintaining software systems, as well as collaborating with cross-functional teams. The ideal candidate will have a strong background in computer science or a related field, and at least 3 years of experience in software development. Must be proficient in multiple programming languages, including Java, Python, and C++. Strong problem-solving skills and the ability to work independently or as part of a team are required. This is a full-time position located in San Francisco, CA.",
|
||||
"requirements": [
|
||||
"Bachelor's degree in Computer Science or related field",
|
||||
"At least 3 years of experience in software development",
|
||||
"Proficiency in Java, Python, and C++",
|
||||
"Strong problem-solving skills",
|
||||
"Ability to work independently or as part of a team"
|
||||
],
|
||||
"location": "San Francisco, CA",
|
||||
"company_name": "Innovative Solutions Inc."
|
||||
}
|
||||
]
|
||||
5509
week3/muawiya/notebooks/synthetic_data_generator.ipynb
Normal file
5509
week3/muawiya/notebooks/synthetic_data_generator.ipynb
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user