Merge pull request #24 from lenianiva/experiments/minif2f

experiment: MiniF2F speedup
2024-10-13 19:21:16 -07:00 · 2024-10-13 19:21:16 -07:00 · 8585e3dd9e
parent 94604086ef 8196493258
commit 8585e3dd9e
3 changed files with 55 additions and 22 deletions
--- a/experiments/minif2f/README.md
+++ b/experiments/minif2f/README.md
@ -1,10 +1,17 @@
 # MiniF2F
 This is an experiment on running an LLM prover on miniF2F data. Build the project
-`MiniF2F` with `lake build`, and run with
+`MiniF2F` with `lake build`. Check the environment and data with
 ``` sh
 python3 experiments/minif2f/main.py check
 python3 experiments/minif2f/main.py list
 ```
 and run experiments with
 ```sh
-python3 experiments/minif2f/main.py [--dry-run] [--use-llm]
+python3 experiments/minif2f/main.py eval [--use-llm] [--use-hammer]
 ```
 Read the help message carefully.
--- a/experiments/minif2f/main.py
+++ b/experiments/minif2f/main.py
@ -29,10 +29,12 @@ def try_test_data(server, agent, entry: dict, max_steps: int, max_trials_per_goa
    goal_states = server.load_sorry(command)
-    if len(goal_states) == 0:
+    if len(goal_states) != 1:
        return None
    goal_state, = goal_states
    if isinstance(goal_state, list):
        return None
    try:
        return agent.search(
            server=server,
@ -103,16 +105,28 @@ def run_eval(args):
            with open(file_name, 'w') as f:
                json.dump({ 'id': datum['id'], 'success': result.success, 'steps': result.steps  }, f)
 def run_check(args):
    """
    Print the resolved project path and Lean path, then construct a `Server`.

    `args` is accepted but not read in this function, and the server object
    is not used after it has been constructed.
    """
    project_path, lean_path = get_project_and_lean_path()
    print(f"$PWD: {project_path}")
    print(f"$LEAN_PATH: {lean_path}")
    # NOTE(review): presumably a successful construction with Mathlib and
    # Aesop imported is the environment check itself -- confirm.
    server = Server(
        imports=["Mathlib", "Aesop"],
        project_path=project_path,
        lean_path=lean_path,
        core_options=CORE_OPTIONS,
    )
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='MiniF2F Search',
        description='Executes LLM on MiniF2F Search',
    )
    parser.add_argument('--use-hammer', action='store_true')
    parser.add_argument(
-        '--dry-run',
+        'mode',
-        action='store_true',
+        help='Function',
-        help="List the data used, but don't run")
+        choices=['list', 'eval', 'check'],
    )
    parser.add_argument('--use-hammer', action='store_true')
    parser.add_argument('--validation', action='store_true')
    parser.add_argument('--use-llm', action='store_true')
    parser.add_argument('--max-steps', default=50)
@ -120,7 +134,11 @@ if __name__ == '__main__':
    parser.add_argument('--feedback-turns', default=2)
    args = parser.parse_args()
-    if args.dry_run:
+    if args.mode == "list":
        dry_run(args)
-    else:
+    elif args.mode == "eval":
        run_eval(args)
    elif args.mode == "check":
        run_check(args)
    else:
        raise ValueError(f"Invalid mode: {args.mode}")
--- a/experiments/minif2f/model/llm_agent.py
+++ b/experiments/minif2f/model/llm_agent.py
@ -1,5 +1,6 @@
 from typing import Optional
 import collections, unittest
 from termcolor import colored
 from pantograph.search import Agent
 from pantograph.server import Server, TacticFailure, ServerError
 from pantograph.expr import Expr, Tactic, GoalState
@ -57,6 +58,7 @@ class LLMAgent(Agent):
            new_state = None
            for ii in range(self.n_trials):
                print(f"===============trail {str(ii)}============")
                try:
                    s = select_tactic.run(
                        server=self.server,
                        state=state,
@ -70,7 +72,13 @@ class LLMAgent(Agent):
                    print("\n-- new state --\n", new_state)
                    if tactic:
                        if not isinstance(tactic, Tactic):
                            print(colored("[Tactic] Failed:", "red"), tactic)
                            return None
                        return tactic
                except Exception as e:
                    print(colored(str(e), "red"))
                    return None
            return None
        else:
            self.goal_tactic_id_map[key] = i + 1