add README

rpeeters85 · rpeeters85 · commit 6e370c6ef8fe · 2025-06-05T11:47:45.000+02:00
diff --git a/README.md b/README.md
@@ -1 +1,37 @@
-# WebMall
+# WebMall - A Multi-Shop Benchmark for Evaluating Web Agents
+
+## Setting up WebMall
+
+### Environment
+- WebMall requires Python 3.11 or 3.12.
+- WebMall requires a Python environment without installed versions of BrowserGym and AgentLab, as we provide edited versions of BrowserGym and AgentLab which need local installation (steps below).
+
+### Install local version of BrowserGym
+- As we use forks of BrowserGym and AgentLab as submodules, they must be initialized first with ```git submodule update --init --recursive```
+- Run ```make install``` in a terminal in the ```WebMall/BrowserGym``` folder to install BrowserGym and to install Playwright to run experiments in a browser.
+
+### Install local version of AgentLab
+- Run ```pip install -e .``` in ```WebMall/AgentLab```
+
+### Setup AgentLab
+- Set the environment variables:
+  - export AGENTLAB_EXP_ROOT=<root directory of experiment results>  # defaults to $HOME/agentlab_results; this is where the experiment results will be stored.
+  - export OPENAI_API_KEY=<your openai api key> # if OpenAI models are used, set the OpenAI API key.
+  - export ANTHROPIC_API_KEY=<your anthropic api key> # if Anthropic models are used, set the Anthropic API key.
+
+### Setup WebMall environment variables
+- WebMall expects a file: WebMall/.env which contains environment variables setting the addresses of the shop websites. Make a copy of WebMall/.env.example and rename it to .env. Then set the variables SHOP1_URL, SHOP2_URL, SHOP3_URL, SHOP4_URL, FRONTEND_URL according to the shop addresses you want to use. (The local Docker setup uses localhost with ports 8081-8085; if these ports are available, the variables are already set correctly.)
+
+### Setup the WebMall-Shops-Websites locally with Docker
+1. The local docker setup requires docker-compose
+2. Run ```bash docker_all/restore_all_and_deploy_local.sh``` to download the relevant files, start the containers and host the shops locally.
+3. If you used the default ports, the setup is done. If not, you need to change the addresses inside the WooCommerce containers by running ```docker_all/fix_urls_deploy.sh``` for each of the 4 shops inside the respective Docker containers.
+Example: ```docker exec WebMall_wordpress_shop1 /bin/bash -c "/usr/local/bin/fix_urls_deploy.sh 'http://localhost:8081' 'http://localhost:7733'"```
+4. Verify the setup by visiting the Shop-Websites and the Submission page in your browser. 
+
+## Run WebMall-Benchmark
+- A WebMall benchmark run can be started with the script ```WebMall/run_webmall_study.py```.
+Its results will be stored in the directory you set in AGENTLAB_EXP_ROOT. Select the task set you want to run by uncommenting the relevant ```benchmark``` variable in the file.
+
+## Run a single WebMall task
+- A run for a single WebMall task can be started with the script ```WebMall/run_single_task.py```. Its results will be stored in the directory you set in AGENTLAB_EXP_ROOT.
diff --git a/run_webmall_study.py b/run_webmall_study.py
@@ -21,15 +21,10 @@
 
 logging.getLogger().setLevel(logging.DEBUG)
 
-#from agentlab.agents.webmall_generic_agent import AGENT_4o_VISION
-#from agentlab.agents.generic_agent import AGENT_4o_VISION
-
 from agentlab.agents import dynamic_prompting as dp
 
-#from agentlab.llm.eco_logits_llm_configs import CHAT_MODEL_ARGS_DICT
 from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
-#from agentlab.agents.webmall_generic_agent.generic_agent import GenericAgent, GenericPromptFlags, GenericAgentArgs
 from agentlab.agents.generic_agent.generic_agent import GenericAgent, GenericPromptFlags, GenericAgentArgs
 
 FLAGS_default = GenericPromptFlags(
@@ -87,13 +82,12 @@
 FLAGS_AX_M.use_memory = True
 FLAGS_AX_M.extra_instructions = 'Use your memory to note down important information like the URLs of potential solutions and corresponding pricing information.'
 
-FLAGS_AX_V_M = FLAGS_default.copy()
-FLAGS_AX_V_M.obs.use_screenshot = True
-FLAGS_AX_V_M.obs.use_som = True
-FLAGS_AX_V_M.use_memory = True
-FLAGS_AX_V_M.extra_instructions = 'Use your memory to note down important information like the URLs of potential solutions and corresponding pricing information.'
-
 AGENT_41_AX = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-2025-04-14"],
+    flags=FLAGS_AX,
+)
+
+AGENT_CLAUDE_AX = GenericAgentArgs(
     chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"],
     flags=FLAGS_AX,
 )
@@ -103,20 +97,29 @@
     flags=FLAGS_V,
 )
 
+AGENT_CLAUDE_V = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"],
+    flags=FLAGS_V,
+)
 
 AGENT_41_AX_V = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-2025-04-14"],
+    flags=FLAGS_AX_V,
+)
+
+AGENT_CLAUDE_AX_V = GenericAgentArgs(
     chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"],
     flags=FLAGS_AX_V,
 )
 
 AGENT_41_AX_M = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"],
+    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-2025-04-14"],
     flags=FLAGS_AX_M,
 )
 
-AGENT_41_AX_V_M = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4.1-2025-04-14"],
-    flags=FLAGS_AX_V_M,
+AGENT_CLAUDE_AX_M = GenericAgentArgs(
+    chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"],
+    flags=FLAGS_AX_M,
 )
 
 current_file = Path(__file__).resolve()
@@ -125,21 +128,12 @@
 
 
 # choose your agent or provide a new agent
-agent_args = [AGENT_41_AX_M]
+agent_args = [AGENT_41_AX]
 
 # ## select the benchmark to run on
-# benchmark = "webmall_a_c_d"
-# benchmark = "webmall_tiny"
-# benchmark = "webmall"
-# benchmark = "miniwob"
-# benchmark = "workarena_l1"
-# benchmark = "workarena_l2"
-# benchmark = "workarena_l3"
-# benchmark = "webarena"
-#benchmark = "webmall_basic_v0.7"
-benchmark = "webmall_advanced_v0.7"
-#benchmark = "webmall_tiny_v0.7"
-# benchmark = "webmall_j_v0.7"
+
+benchmark = "webmall_basic_v0.7"
+# benchmark = "webmall_advanced_v0.7"
 
 # Set reproducibility_mode = True for reproducibility
 # this will "ask" agents to be deterministic. Also, it will prevent you from launching if you have