diff --git a/experiments/dsp/README.md b/experiments/dsp/README.md
index 7a587df..6413bc6 100644
--- a/experiments/dsp/README.md
+++ b/experiments/dsp/README.md
@@ -13,7 +13,7 @@ lake build
 ```
 Then run `main.py`
 ``` sh
-python3 experiments/dsp/main.py -h
+python3 main.py -h
 ```
 
 The main command for running DSP is `eval`. Due to the multitude of data format
@@ -21,7 +21,19 @@ out there, use the `--format` flag to specify the data format. For example,
 running DSP on minif2f is:
 
 ``` sh
-python3 experiments/dsp/main.py eval --dataset ../minif2f/valid.jsonl --format minif2f
+python3 main.py eval \
+    --dataset ../minif2f/valid.jsonl \
+    --format minif2f \
+    --output results-minif2f-valid
+```
+
+Then, use `plot.py` to generate the plots
+
+``` sh
+python3 plot.py \
+    --result results-minif2f-{valid,test} \
+    --names valid test \
+    --plot-output output-plot
 ```
 
 ## Related work
diff --git a/experiments/dsp/main.py b/experiments/dsp/main.py
index 562fd66..8437b85 100644
--- a/experiments/dsp/main.py
+++ b/experiments/dsp/main.py
@@ -64,7 +64,7 @@ class OpenAI_DSP_Engine(Engine):
 
         print(f'{base_url=}') if verbose_init else None
 
-        if not ('gpt-4-' in model or 'gpt-3.5-' in model or 'gpt-4o' in model):
+        if not ('gpt-4-' in model or 'gpt-3.5-' in model or 'gpt-4o' in model or model == "o1-preview"):
             raise ValueError(f"Model {model=} not supported.")
         self.model = model
         self.api_key = api_key
@@ -82,6 +82,41 @@ class OpenAI_DSP_Engine(Engine):
         # Prove params
         # ...TODO not sure if needed right now...
 
+    @property
+    def role_prompt(self) -> str:
+        return "assistant" if self.model.startswith("o1") else "system"
+
+    def sample_draft(self, prompt: str):
+        extra = {} if self.model.startswith("o1") else dict(
+            temperature=self.draft_sampling_params.temperature,
+            top_p=self.draft_sampling_params.top_p,
+            stop=self.draft_sampling_params.stop[:3],
+        )
+        return self.llm.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": self.role_prompt, "content": self.draft_system_prompt},
+                {"role": "user", "content": prompt},
+            ],
+            n=self.draft_sampling_params.n,
+            **extra,
+        )
+    def sample_sketch(self, prompt: str):
+        extra = {} if self.model.startswith("o1") else dict(
+            temperature=self.sketch_sampling_params.temperature,
+            top_p=self.sketch_sampling_params.top_p,
+        )
+        return self.llm.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": self.role_prompt, "content": self.sketch_system_prompt},
+                {"role": "user", "content": prompt},
+            ],
+            n=self.sketch_sampling_params.n,
+            **extra,
+            # stop=eng.sketch_sampling_params.stop[:3],
+        )
+
 @retry(stop=stop_after_attempt(15), wait=wait_exponential(multiplier=2, max=128))
 def autoformalize_prob(
     eng: Engine,
@@ -106,17 +141,7 @@ def step_draft(
     prompt = eng.draft_prompt_template.replace('{nl_problem}', nl_problem)
     # Get all **completions** to single prompt, one (in) -> many (out)
     # ref: https://platform.openai.com/docs/api-reference/chat/object
-    response: Any = eng.llm.chat.completions.create(
-        model=eng.model,
-        messages=[
-            {"role": "system", "content": eng.draft_system_prompt},
-            {"role": "user", "content": prompt},
-        ],
-        temperature=eng.draft_sampling_params.temperature,
-        top_p=eng.draft_sampling_params.top_p,
-        n=eng.draft_sampling_params.n,
-        stop=eng.draft_sampling_params.stop[:3],
-    )
+    response: Any = eng.sample_draft(prompt)
     # Get all completions for single prompt
     completions: list[str] = [
         completion.message.content
@@ -149,17 +174,7 @@ def step_sketch(
     x_fl_problem = datum.fl_problem if datum.fl_problem else autoformalize_prob(eng, datum)
     prompt = eng.sketch_prompt_template.replace('{fl_problem}', x_nl_problem).replace('{fl_problem}', y_nl_solution)
     # Get all **completions** to single prompt, one (in) -> many (out), ref: https://platform.openai.com/docs/api-reference/chat/object
-    response: Any = eng.llm.chat.completions.create(
-        model=eng.model,
-        messages=[
-            {"role": "system", "content": eng.sketch_system_prompt},
-            {"role": "user", "content": prompt},
-        ],
-        temperature=eng.sketch_sampling_params.temperature,
-        top_p=eng.sketch_sampling_params.top_p,
-        n=eng.sketch_sampling_params.n,
-        # stop=eng.sketch_sampling_params.stop[:3],
-    )
+    response: Any = eng.sample_sketch(prompt)
     # Get all completions for single prompt
     completions: list[str] = [completion.message.content for completion in response.choices] # response.choices[i].message
     sketches: list[str] = completions
@@ -455,7 +470,7 @@ if __name__ == "__main__":
         "--model",
         help="Model",
         default="gpt-4o",
-        choices=["gpt2", "gpt-3.5-turbo", "gpt-4o", "deepseek-ai/deepseek-math-7b-instruct"],
+        choices=["gpt2", "gpt-3.5-turbo", "gpt-4o", "deepseek-ai/deepseek-math-7b-instruct", "o1-preview"],
     )
     parser.add_argument(
         "--format",
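The substantive change in the diff is the pair of `sample_draft`/`sample_sketch` helpers, which gate request parameters by model family: at launch, OpenAI's o1-series chat completions rejected `system` messages and fixed sampling knobs such as `temperature`, `top_p`, and `stop`, so the patch swaps the leading role and omits those kwargs entirely rather than passing `None`. Below is a minimal standalone sketch of the same gating pattern, assuming the `openai` Python client; the `gated_chat_completion` name and the sampling values are illustrative, not from the repository.

```python
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def gated_chat_completion(model: str, system_prompt: str, user_prompt: str, n: int = 1):
    """Build per-model-family kwargs, mirroring the diff's sample_draft/sample_sketch."""
    if model.startswith("o1"):
        # o1-series models reject `system` messages and the sampling
        # parameters below, so downgrade the role and send no extra kwargs.
        head_role = "assistant"
        extra = {}
    else:
        head_role = "system"
        extra = {"temperature": 0.8, "top_p": 0.95}  # illustrative values
    return client.chat.completions.create(
        model=model,
        messages=[
            {"role": head_role, "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        n=n,
        **extra,
    )
```

A caller would then read completions the same way the drafting/sketching steps do, e.g. `[c.message.content for c in gated_chat_completion("o1-preview", sys_p, usr_p).choices]`. Building the optional arguments as a dict and splatting them with `**extra` keeps a single `create` call per method, which is why the diff can collapse the two near-duplicate call sites in `step_draft` and `step_sketch` into one-line calls.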