Merge pull request #402 from moawiah/my_cont_week3

contribution for week 3
This commit is contained in:
Ed Donner
2025-06-14 19:30:07 -04:00
committed by GitHub
6 changed files with 5850 additions and 0 deletions

View File

@@ -0,0 +1,102 @@
# 🧠 Synthetic Data Generator
A Python-based tool to generate structured, synthetic job postings using open-source LLMs from Hugging Face.
This project supports both **script-based execution** and an **interactive Colab notebook**, making it ideal for rapid prototyping, dataset bootstrapping, or demonstrating prompt engineering techniques.
> Note: Original Repo can be found at: https://github.com/moawiah/synthetic_data_generator
![Demo Screenshot](https://github.com/user-attachments/assets/c0e229ac-ddb7-4a37-8088-f04ca735cd81)
This tool helps:
- Researchers create labeled training data for NLP classification or QA
- HR tech startups prototype recommendation models
- AI instructors demonstrate few-shot prompting in class
---
## ✨ Features
- 🔗 Integrates Hugging Face Transformer models
- 📄 Generates realistic job postings in structured JSON format
- 🧪 Supports prompt engineering with control over output length and variability
- 🧠 Minimal Gradio UI for non-technical users
- 📓 Jupyter/Colab support for experimentation and reproducibility
## 📂 Project Structure
```
.
├── app/
│   ├── app.py                          # Main script entry point
│   ├── consts.py                       # Configuration and constants
│   └── requirements.txt                # Python dependencies
├── data/
│   └── software_engineer_jobs.json     # Sample input data (JSON format)
├── notebooks/
│   └── synthetic_data_generator.ipynb  # Interactive Colab notebook
├── .env.example                        # Sample environment variable config
├── .gitignore                          # Git ignored files list
└── README.md
```
## 🚀 Getting Started
### 1. Clone the repository
```bash
git clone https://github.com/moawiah/synthetic_data_generator.git
cd synthetic_data_generator
```
### Install Dependencies
```bash
pip install -r app/requirements.txt
```
### Hugging Face Token
Create a `.env` file in the project root containing your Hugging Face token, e.g. `HF_TOKEN=your-token-here`.
### Run
Run the app with:
`python app/app.py`
## Example Output - 1 Job
```json
{
  "title": "Software Engineer",
  "description": "We are seeking a highly skilled software engineer to join our team and contribute to the development of innovative software solutions. The ideal candidate will have experience in designing, coding, and testing software systems, and will be able to work collaboratively with cross-functional teams. Responsibilities include writing clean, maintainable, and efficient code, as well as actively participating in code reviews and continuous integration processes. This is an excellent opportunity for a self-starter with a passion for technology and a desire to grow in their career.",
  "requirements": [
    "Bachelor's degree in Computer Science or related field",
    "Minimum of 2 years experience in software development",
    "Strong proficiency in Java or C++",
    "Experience with agile development methodologies",
    "Good understanding of data structures and algorithms",
    "Excellent problem-solving and analytical skills"
  ],
  "location": "New York, NY",
  "company_name": "ABC Technologies"
}
```
## Future Improvements
- 🔁 Add support for more job roles and industries
- 🧠 Model selector from UI
- 💾 Export dataset as CSV
- ☁️ Optional integration with LangChain or RAG workflows

View File

@@ -0,0 +1,156 @@
import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, pipeline, TextGenerationPipeline
import torch
from consts import FALCON, MISTRAL, Databricks
from dotenv import load_dotenv
import json
import ast
import gradio as gr
import re
# Load variables from a local .env file into the process environment, then
# read the Hugging Face access token from HF_TOKEN (None when it is unset).
# NOTE(review): no explicit login(...) call is visible in this file; presumably
# the Hugging Face libraries pick HF_TOKEN up from the environment - confirm.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
# Main Prompt
# Template filled in with ``prompt.format(role=...)``. The placeholder uses
# single braces on purpose: doubled braces are an escape in str.format and
# would emit the literal text "{role}" instead of the requested job role.
prompt = """
Generate one fake job posting for a {role}.
Return only a single JSON object with:
- title
- description (5-10 sentences)
- requirements (array of 4-6 strings)
- location
- company_name
No explanations, no extra text.
Only the JSON object.
"""
# Main Conf
# Quantization settings handed to the model loader: 4-bit loading with the
# "nf4" quant type, double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
def load_model_and_tokenizer():
    """Load the MISTRAL checkpoint together with its tokenizer.

    The model is loaded with the module-level ``bnb_config`` quantization
    settings and mapped entirely onto the ``cuda`` device.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    model_kwargs = {
        "device_map": {"": "cuda"},
        "trust_remote_code": True,
        "offload_folder": "/tmp/dolly_offload",
        "quantization_config": bnb_config,
    }
    # Tokenizer first, then the model - same order as before.
    tok = AutoTokenizer.from_pretrained(MISTRAL, trust_remote_code=True)
    lm = AutoModelForCausalLM.from_pretrained(MISTRAL, **model_kwargs)
    return lm, tok
def generate_job(role="Software Engineer", model=None, tokenizer=None):
    """Generate the raw text of one fake job posting for ``role``.

    Args:
        role: Job title substituted into the module-level ``prompt`` template.
        model: Language model exposing ``device`` and ``generate``. Required.
        tokenizer: Tokenizer paired with ``model``. Required.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
        NOTE(review): the decoded sequence presumably still starts with the
        prompt text; callers extract the JSON object from it - confirm.

    Raises:
        ValueError: If ``model`` or ``tokenizer`` is not supplied.
    """
    # The None defaults exist only for keyword-style calls; fail early with a
    # clear message instead of an opaque error on the first use of None.
    if model is None or tokenizer is None:
        raise ValueError("generate_job requires both a model and a tokenizer")

    encoded = tokenizer(prompt.format(role=role), return_tensors="pt")
    # Move every input tensor to the device the model lives on.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    outputs = model.generate(
        **encoded,
        max_new_tokens=600,
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
def generate_jobs(role="Software Engineer", n=5):
    """Generate ``n`` raw job-posting texts for ``role``.

    Args:
        role: Job title forwarded to ``generate_job``. It is used as given;
            it is no longer overwritten with a hard-coded value.
        n: Number of postings to generate. Coerced with ``int`` so integral
            floats are accepted.

    Returns:
        A list of ``n`` raw generated strings, or an empty list when ``n``
        is zero or negative.
    """
    count = int(n)
    # Skip the expensive model load when there is nothing to generate.
    if count <= 0:
        return []

    # Load once and reuse the same model/tokenizer for every sample.
    model, tokenizer = load_model_and_tokenizer()
    return [
        generate_job(role=role, model=model, tokenizer=tokenizer)
        for _ in range(count)
    ]
def extract_json_objects_from_text_block(texts):
    """Extract every valid top-level JSON object from messy text.

    Each ``{`` is tried as the start of an object using
    ``json.JSONDecoder.raw_decode``, so nested objects and ``}`` characters
    inside string values are handled. A non-greedy regex would cut such
    objects off at the first closing brace.

    Args:
        texts: A single string or an iterable of strings. ``None`` is treated
            as no input.

    Returns:
        A list of dicts in order of appearance. Objects nested inside an
        extracted object are not returned separately.
    """
    if texts is None:
        return []
    if isinstance(texts, str):
        texts = [texts]  # wrap in list if single string

    decoder = json.JSONDecoder()
    results = []
    for raw_text in texts:
        start = raw_text.find("{")
        while start != -1:
            try:
                obj, end = decoder.raw_decode(raw_text, start)
            except json.JSONDecodeError:
                # Not a valid object at this brace; try the next one.
                start = raw_text.find("{", start + 1)
                continue
            results.append(obj)
            # Resume after the decoded object so its inner braces are skipped.
            start = raw_text.find("{", end)
    return results
def generate_ui(role, n):
    """Gradio callback: generate jobs, save them as JSON, return both.

    Args:
        role: Job role typed by the user.
        n: Number of samples requested by the user.

    Returns:
        ``(parsed_jobs, filename)`` on success, where ``filename`` is the
        path of the JSON file written under ``data/``. On any failure
        returns ``(gr.update(value=[], visible=True), None)``.
    """
    try:
        raw_jobs = generate_jobs(role, n)
        parsed_jobs = extract_json_objects_from_text_block(raw_jobs)
        if not isinstance(parsed_jobs, list) or not all(
            isinstance(item, dict) for item in parsed_jobs
        ):
            print("[ERROR] Parsed result is not a list of dicts")
            return gr.update(value=[], visible=True), None

        # ``role`` is free text from the UI: collapse anything that is not a
        # word character, "+" or "-" so it cannot inject path separators.
        slug = re.sub(r"[^\w+-]+", "_", role.strip().lower()).strip("_") or "role"
        # The output directory may not exist on a fresh checkout.
        os.makedirs("data", exist_ok=True)
        filename = f"data/{slug}_jobs.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(parsed_jobs, f, indent=2)

        print(f"[INFO] Returning {len(parsed_jobs)} jobs -> {filename}")
        return parsed_jobs, filename
    except Exception as e:
        # UI boundary: report the failure and show an empty result rather
        # than letting the exception escape the callback.
        print(f"[FATAL ERROR] {e}")
        return gr.update(value=[], visible=True), None
if __name__ == "__main__":
    # Build the Gradio page: two inputs, a trigger button, and two outputs
    # (a JSON preview plus a downloadable file).
    with gr.Blocks() as demo:
        gr.Markdown("# 🧠 Synthetic Job Dataset Generator")
        gr.Markdown("Generate a structured dataset of job postings for a specific role.")

        with gr.Row():
            role_box = gr.Textbox(
                label="Job Role",
                placeholder="e.g. Software Engineer",
                value="Software Engineer",
            )
            sample_count = gr.Number(label="Number of Samples", value=5, precision=0)

        run_button = gr.Button("🚀 Generate")
        dataset_view = gr.JSON(label="Generated Dataset")
        dataset_file = gr.File(label="Download JSON")

        # generate_ui returns (jobs, filename), matching the two outputs.
        run_button.click(
            generate_ui,
            inputs=[role_box, sample_count],
            outputs=[dataset_view, dataset_file],
        )

    demo.launch(debug=True, share=True)

View File

@@ -0,0 +1,5 @@
# Models
# Model identifier strings. app.py imports FALCON, MISTRAL and Databricks by
# these exact names, so the names must stay as they are.
GPT = "gpt2"
FALCON = "tiiuae/falcon-rw-1b"
MISTRAL = "mistralai/Mistral-7B-Instruct-v0.1"
Databricks = "databricks/dolly-v2-3b"

View File

@@ -0,0 +1,7 @@
huggingface_hub==0.30.2
ipython==8.12.3
openai==1.76.2
protobuf==6.30.2
Requests==2.32.3
torch==2.6.0+cu124
transformers==4.51.3

View File

@@ -0,0 +1,71 @@
[
{
"title": "Software Engineer",
"description": "We are seeking a highly skilled software engineer to join our team in developing and maintaining complex software systems. The ideal candidate will have a strong background in computer science and experience with multiple programming languages. Responsibilities include writing clean and efficient code, collaborating with cross-functional teams, and actively participating in code reviews. This is an excellent opportunity for a self-starter with a passion for technology and a desire to grow in their career.",
"requirements": [
"Bachelor's degree in Computer Science or related field",
"3+ years of experience in software development",
"Strong proficiency in Java or C++",
"Experience with agile development methodologies",
"Excellent problem-solving and analytical skills"
],
"location": "New York, NY",
"company_name": "ABC Technologies"
},
{
"title": "Software Engineer",
"description": "We are looking for a highly skilled software engineer to join our team and contribute to the development of innovative software solutions. The ideal candidate will have experience in designing, developing, and testing software systems, and be able to work independently or as part of a team. Responsibilities include writing clean and efficient code, collaborating with cross-functional teams, and actively participating in code reviews. Must have a strong understanding of computer science principles and be able to learn quickly. This is a full-time position located in San Francisco, CA.",
"requirements": [
"Bachelor's degree in Computer Science or related field",
"3+ years of experience in software development",
"Strong proficiency in Java or C++",
"Experience with agile development methodologies",
"Excellent problem-solving skills",
"Ability to work in a fast-paced environment"
],
"location": "San Francisco, CA",
"company_name": "Acme Inc."
},
{
"title": "Software Engineer",
"description": "We are seeking a highly skilled software engineer to join our team in developing and maintaining our cutting-edge software applications. The ideal candidate will have a strong background in computer science and software engineering, with experience in designing, coding, and testing software systems. Responsibilities include collaborating with cross-functional teams, writing clean and efficient code, and ensuring the timely delivery of high-quality software products. This is an excellent opportunity for a self-starter with a passion for technology and a desire to work in a dynamic and fast-paced environment.",
"requirements": [
"Bachelor's degree in Computer Science or related field",
"3+ years of experience in software engineering",
"Strong proficiency in Java, Python, or C++",
"Experience with agile development methodologies",
"Excellent problem-solving and analytical skills",
"Strong communication and interpersonal skills"
],
"location": "New York, NY",
"company_name": "ABC Tech"
},
{
"title": "Software Engineer",
"description": "We are seeking a highly skilled software engineer to join our team and contribute to the development of innovative software solutions. The ideal candidate will have a strong background in computer science and experience with various programming languages and technologies. Responsibilities include designing, coding, testing, and maintaining software systems, as well as collaborating with cross-functional teams. This is an excellent opportunity for a creative and motivated individual to make a significant impact in the tech industry.",
"requirements": [
"Bachelor's degree in Computer Science or related field",
"Minimum of 2 years experience in software development",
"Strong proficiency in Java, Python, or C++",
"Experience with agile development methodologies",
"Excellent problem-solving and analytical skills",
"Ability to work independently and as part of a team",
"Strong communication and interpersonal skills"
],
"location": "New York, NY",
"company_name": "ABC Tech Inc."
},
{
"title": "Software Engineer",
"description": "We are looking for a skilled software engineer to join our team and contribute to the development of innovative software solutions. Responsibilities include designing, coding, testing and maintaining software systems, as well as collaborating with cross-functional teams. The ideal candidate will have a strong background in computer science or a related field, and at least 3 years of experience in software development. Must be proficient in multiple programming languages, including Java, Python, and C++. Strong problem-solving skills and the ability to work independently or as part of a team are required. This is a full-time position located in San Francisco, CA.",
"requirements": [
"Bachelor's degree in Computer Science or related field",
"At least 3 years of experience in software development",
"Proficiency in Java, Python, and C++",
"Strong problem-solving skills",
"Ability to work independently or as part of a team"
],
"location": "San Francisco, CA",
"company_name": "Innovative Solutions Inc."
}
]

File diff suppressed because one or more lines are too long