diff --git a/.env.example b/.env.example index 888a0fe..1e27917 100644 --- a/.env.example +++ b/.env.example @@ -36,3 +36,20 @@ LANGSMITH_PROJECT="Agentic Code Optimization" # ============================================================================ # Set to true to enable verbose logging # LANGCHAIN_DEBUG=true + +# ============================================================================ +# Benchmark Configuration (optional) +# ============================================================================ +# Command to run baseline/post benchmarks +# Example: +export BENCHMARK_CMD="cd DeathStarBench/socialNetwork && \ +docker compose build user-service && \ +docker compose up -d && \ +docker run --rm --platform linux/amd64 \ + --network socialnetwork_default \ + -e max_user_index=962 \ + -v /Users/peng397/Desktop/agentic-code-optimization/DeathStarBench/socialNetwork/wrk2/scripts/social-network/mixed-workload.lua:/tmp/sn_mixed.lua:ro \ + deathstarbench/wrk2-client \ + /usr/local/bin/wrk -D exp -t 12 -c 400 -d 30 -L \ + -s /tmp/sn_mixed.lua \ + http://nginx-thrift:8080/wrk2-api/home-timeline/read -R 200" \ No newline at end of file diff --git a/.semgrep/aspnet-endpoints.yml b/.semgrep/aspnet-endpoints.yml deleted file mode 100644 index 0e823a7..0000000 --- a/.semgrep/aspnet-endpoints.yml +++ /dev/null @@ -1,70 +0,0 @@ -rules: - - id: aspnet-http-get-endpoint - patterns: - - pattern: | - [HttpGet(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP GET endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: GET - - - id: aspnet-http-post-endpoint - patterns: - - pattern: | - [HttpPost(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP POST endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: POST - - - id: aspnet-http-put-endpoint - patterns: - - pattern: | - [HttpPut(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP PUT endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: PUT - - - id: aspnet-http-delete-endpoint - patterns: - - pattern: | - [HttpDelete(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP DELETE endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: DELETE - - - id: aspnet-route-attribute - patterns: - - pattern: | - [Route($ROUTE)] - class $CONTROLLER : ControllerBase { ... } - message: "Found API controller: $CONTROLLER with route $ROUTE" - languages: [csharp] - severity: INFO - metadata: - category: controller - - - id: aspnet-api-controller - patterns: - - pattern: | - [ApiController] - class $NAME : ControllerBase { ... } - message: "Found API controller: $NAME" - languages: [csharp] - severity: INFO - metadata: - category: controller diff --git a/.semgrep/cache-queue-grpc.yml b/.semgrep/cache-queue-grpc.yml deleted file mode 100644 index dc5fe89..0000000 --- a/.semgrep/cache-queue-grpc.yml +++ /dev/null @@ -1,228 +0,0 @@ -rules: - # Redis/Cache Clients - - id: cache-java-jedis - pattern-either: - - pattern: new Jedis(...) - - pattern: new JedisPool(...) - - pattern: $CLIENT.get(...) - - pattern: $CLIENT.set(...) - message: "Redis client usage via Jedis" - languages: [java] - severity: INFO - metadata: - category: cache - library: Jedis - type: redis - - - id: cache-java-lettuce - pattern-either: - - pattern: RedisClient.create(...) - - pattern: $CLIENT.connect() - message: "Redis client usage via Lettuce" - languages: [java] - severity: INFO - metadata: - category: cache - library: Lettuce - type: redis - - - id: cache-csharp-stackexchange - pattern-either: - - pattern: ConnectionMultiplexer.Connect(...) - - pattern: $CONN.GetDatabase(...) - - pattern: $DB.StringGet(...) - - pattern: $DB.StringSet(...) - message: "Redis client usage via StackExchange.Redis" - languages: [csharp] - severity: INFO - metadata: - category: cache - library: StackExchange.Redis - type: redis - - - id: cache-go-redis - pattern-either: - - pattern: redis.NewClient(...) - - pattern: $CLIENT.Get(...) - - pattern: $CLIENT.Set(...) - message: "Redis client usage via go-redis" - languages: [go] - severity: INFO - metadata: - category: cache - library: go-redis - type: redis - - - id: cache-python-redis - pattern-either: - - pattern: redis.Redis(...) - - pattern: redis.StrictRedis(...) - - pattern: $CLIENT.get(...) - - pattern: $CLIENT.set(...) - message: "Redis client usage via redis-py" - languages: [python] - severity: INFO - metadata: - category: cache - library: redis-py - type: redis - - - id: cache-ts-ioredis - pattern-either: - - pattern: new Redis(...) - - pattern: $CLIENT.get(...) - - pattern: $CLIENT.set(...) - message: "Redis client usage via ioredis" - languages: [typescript, javascript] - severity: INFO - metadata: - category: cache - library: ioredis - type: redis - - # Message Queue Clients - Kafka - - id: queue-java-kafka-producer - pattern-either: - - pattern: new KafkaProducer(...) - - pattern: $PRODUCER.send(...) - message: "Kafka producer usage" - languages: [java] - severity: INFO - metadata: - category: queue - library: kafka-clients - type: kafka - direction: producer - - - id: queue-java-kafka-consumer - pattern-either: - - pattern: new KafkaConsumer(...) - - pattern: $CONSUMER.subscribe(...) - - pattern: $CONSUMER.poll(...) - message: "Kafka consumer usage" - languages: [java] - severity: INFO - metadata: - category: queue - library: kafka-clients - type: kafka - direction: consumer - - - id: queue-go-kafka - pattern-either: - - pattern: kafka.NewWriter(...) - - pattern: kafka.NewReader(...) - - pattern: $WRITER.WriteMessages(...) - - pattern: $READER.ReadMessage(...) - message: "Kafka usage via kafka-go" - languages: [go] - severity: INFO - metadata: - category: queue - library: kafka-go - type: kafka - - - id: queue-python-kafka - pattern-either: - - pattern: KafkaProducer(...) - - pattern: KafkaConsumer(...) - - pattern: $PRODUCER.send(...) - message: "Kafka usage via kafka-python" - languages: [python] - severity: INFO - metadata: - category: queue - library: kafka-python - type: kafka - - # Message Queue Clients - RabbitMQ - - id: queue-java-rabbitmq - pattern-either: - - pattern: new ConnectionFactory() - - pattern: $FACTORY.newConnection() - - pattern: $CONN.createChannel() - - pattern: $CHANNEL.basicPublish(...) - - pattern: $CHANNEL.basicConsume(...) - message: "RabbitMQ usage via amqp-client" - languages: [java] - severity: INFO - metadata: - category: queue - library: amqp-client - type: rabbitmq - - - id: queue-csharp-rabbitmq - pattern-either: - - pattern: new ConnectionFactory() - - pattern: $FACTORY.CreateConnection() - - pattern: $CONN.CreateModel() - - pattern: $CHANNEL.BasicPublish(...) - - pattern: $CHANNEL.BasicConsume(...) - message: "RabbitMQ usage via RabbitMQ.Client" - languages: [csharp] - severity: INFO - metadata: - category: queue - library: RabbitMQ.Client - type: rabbitmq - - - id: queue-python-pika - pattern-either: - - pattern: pika.BlockingConnection(...) - - pattern: $CONN.channel() - - pattern: $CHANNEL.basic_publish(...) - - pattern: $CHANNEL.basic_consume(...) - message: "RabbitMQ usage via pika" - languages: [python] - severity: INFO - metadata: - category: queue - library: pika - type: rabbitmq - - # gRPC Clients - - id: grpc-java-client - pattern-either: - - pattern: ManagedChannelBuilder.forAddress(...) - - pattern: $STUB.newBlockingStub(...) - - pattern: $STUB.newStub(...) - message: "gRPC client usage" - languages: [java] - severity: INFO - metadata: - category: grpc - library: grpc-java - - - id: grpc-csharp-client - pattern-either: - - pattern: new Channel(...) - - pattern: new GrpcChannel(...) - - pattern: new $SERVICE.$ SERVICEClient(...) - message: "gRPC client usage" - languages: [csharp] - severity: INFO - metadata: - category: grpc - library: Grpc.Net.Client - - - id: grpc-go-client - pattern-either: - - pattern: grpc.Dial(...) - - pattern: grpc.DialContext(...) - message: "gRPC client usage" - languages: [go] - severity: INFO - metadata: - category: grpc - library: grpc-go - - - id: grpc-python-client - pattern-either: - - pattern: grpc.insecure_channel(...) - - pattern: grpc.secure_channel(...) - message: "gRPC client usage" - languages: [python] - severity: INFO - metadata: - category: grpc - library: grpcio diff --git a/.semgrep/cpp-services.yml b/.semgrep/cpp-services.yml deleted file mode 100644 index 4b45e36..0000000 --- a/.semgrep/cpp-services.yml +++ /dev/null @@ -1,112 +0,0 @@ -rules: - # Apache Thrift - - id: cpp-grpc-thrift-client - pattern-either: - - pattern: new $CLIENT(...) - - pattern: apache::thrift::protocol::TProtocol - - pattern: apache::thrift::transport::TSocket - - pattern: ThriftClient<$SERVICE> - - pattern: ClientPool> - - pattern: $POOL->Pop() - message: "Apache Thrift client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: grpc - library: thrift - type: rpc - - - id: cpp-service-thrift-server - pattern-either: - - pattern: new apache::thrift::server::TThreadedServer(...) - - pattern: new apache::thrift::server::TNonblockingServer(...) - - pattern: new apache::thrift::server::TSimpleServer(...) - - pattern: new TThreadedServer(...) - - pattern: new TNonblockingServer(...) - # Stack allocation patterns (common in C++) - - pattern: TThreadedServer $SERVER(...); - - pattern: TNonblockingServer $SERVER(...); - - pattern: TSimpleServer $SERVER(...); - - pattern: apache::thrift::server::TThreadedServer $SERVER(...) - # Fallback regex for robust detection - - pattern-regex: (TThreadedServer|TNonblockingServer|TSimpleServer)\s - message: "Apache Thrift server definition" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: service - library: thrift - type: rpc - - # Redis - - id: cpp-cache-redis - pattern-either: - - pattern: RedisClient(...) - - pattern: redisConnect(...) - - pattern: redisCommand(...) - - pattern: cpp_redis::client - - pattern: $CLIENT->zadd(...) - - pattern: $CLIENT->zrevrange(...) - - pattern: $CLIENT->get(...) - - pattern: $CLIENT->set(...) - - pattern: $CLIENT->hget(...) - - pattern: $CLIENT->hset(...) - - pattern: sw::redis::Redis - message: "Redis client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: cache - library: hiredis - type: redis - - # Memcached - - id: cpp-cache-memcached - pattern-either: - - pattern: memcached_create(...) - - pattern: memcached_server_push(...) - - pattern: memcached_set(...) - - pattern: memcached_get(...) - message: "Memcached client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: cache - library: libmemcached - type: memcached - - # MongoDB - - id: cpp-database-mongo - pattern-either: - - pattern: mongocxx::client(...) - - pattern: mongocxx::instance(...) - - pattern: $COLL.find(...) - - pattern: $COLL.insert_one(...) - # C Driver patterns found in socialNetwork - - pattern: mongoc_client_pool_pop(...) - - pattern: mongoc_client_get_collection(...) - - pattern: mongoc_collection_find(...) - - pattern: mongoc_collection_find_with_opts(...) - - pattern: mongoc_collection_find_and_modify(...) - message: "MongoDB client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: database - library: mongoc - type: mongodb - - # AMQP / RabbitMQ - - id: cpp-queue-amqp - pattern-either: - - pattern: AmqpClient::Channel(...) - - pattern: AmqpClient::Channel::Create(...) - - pattern: $CHAN->BasicPublish(...) - - pattern: $CHAN->BasicConsume(...) - message: "AMQP client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: queue - library: SimpleAmqpClient - type: rabbitmq diff --git a/.semgrep/csharp-clients.yml b/.semgrep/csharp-clients.yml deleted file mode 100644 index 6409b32..0000000 --- a/.semgrep/csharp-clients.yml +++ /dev/null @@ -1,55 +0,0 @@ -rules: - - id: csharp-httpclient-usage - patterns: - - pattern-either: - - pattern: | - new HttpClient() - - pattern: | - IHttpClientFactory $FACTORY - - pattern: | - CreateClient($NAME) - message: "Found HttpClient usage" - languages: [csharp] - severity: INFO - metadata: - category: http-client - - - id: csharp-redis-cache - patterns: - - pattern-either: - - pattern: | - IDistributedCache $CACHE - - pattern: | - AddStackExchangeRedisCache(...) - - pattern: | - ConnectionMultiplexer.Connect($CONN) - message: "Found Redis cache usage" - languages: [csharp] - severity: INFO - metadata: - category: cache - type: redis - - - id: csharp-rabbitmq-client - patterns: - - pattern-either: - - pattern: | - new ConnectionFactory() - - pattern: | - IModel $CHANNEL - message: "Found RabbitMQ usage" - languages: [csharp] - severity: INFO - metadata: - category: queue - type: rabbitmq - - - id: csharp-grpc-client - patterns: - - pattern: | - $CLIENT.CallInvoker - message: "Found gRPC client usage" - languages: [csharp] - severity: INFO - metadata: - category: grpc diff --git a/.semgrep/database-clients.yml b/.semgrep/database-clients.yml deleted file mode 100644 index 25a807b..0000000 --- a/.semgrep/database-clients.yml +++ /dev/null @@ -1,175 +0,0 @@ -rules: - # Java Database Clients - - id: database-java-jdbc - pattern-either: - - pattern: DriverManager.getConnection(...) - - pattern: DataSource.getConnection(...) - - pattern: $CONN.createStatement() - - pattern: $CONN.prepareStatement(...) - message: "Database connection using JDBC" - languages: [java] - severity: INFO - metadata: - category: database - library: JDBC - - - id: database-java-jpa - pattern-either: - - pattern: | - @Entity - class $CLASS { ... } - - pattern: EntityManager.createQuery(...) - - pattern: $EM.persist(...) - - pattern: $EM.find(...) - message: "Database usage via JPA/Hibernate" - languages: [java] - severity: INFO - metadata: - category: database - library: JPA - - # C# Database Clients - - id: database-csharp-ef - pattern-either: - - pattern: | - class $CLASS : DbContext { ... } - - pattern: new DbContext(...) - - pattern: $CTX.Set<$TYPE>() - - pattern: $CTX.SaveChanges(...) - - pattern: $CTX.SaveChangesAsync(...) - message: "Database usage via Entity Framework" - languages: [csharp] - severity: INFO - metadata: - category: database - library: EntityFramework - - - id: database-csharp-sqlclient - pattern-either: - - pattern: new SqlConnection(...) - - pattern: new SqlCommand(...) - - pattern: $CMD.ExecuteReader() - - pattern: $CMD.ExecuteNonQuery() - message: "Database connection using ADO.NET SqlClient" - languages: [csharp] - severity: INFO - metadata: - category: database - library: SqlClient - - - id: database-csharp-npgsql - pattern-either: - - pattern: new NpgsqlConnection(...) - - pattern: new NpgsqlCommand(...) - message: "PostgreSQL connection using Npgsql" - languages: [csharp] - severity: INFO - metadata: - category: database - library: Npgsql - - # Go Database Clients - - id: database-go-sql - pattern-either: - - pattern: sql.Open(...) - - pattern: $DB.Query(...) - - pattern: $DB.Exec(...) - - pattern: $DB.QueryRow(...) - message: "Database connection using database/sql" - languages: [go] - severity: INFO - metadata: - category: database - library: database/sql - - - id: database-go-gorm - pattern-either: - - pattern: gorm.Open(...) - - pattern: $DB.Create(...) - - pattern: $DB.Find(...) - - pattern: $DB.Where(...) - message: "Database usage via GORM" - languages: [go] - severity: INFO - metadata: - category: database - library: GORM - - # Python Database Clients - - id: database-python-sqlalchemy - pattern-either: - - pattern: create_engine(...) - - pattern: sessionmaker(...) - - pattern: $SESSION.query(...) - - pattern: $SESSION.add(...) - - pattern: $SESSION.commit() - message: "Database usage via SQLAlchemy" - languages: [python] - severity: INFO - metadata: - category: database - library: SQLAlchemy - - - id: database-python-psycopg - pattern-either: - - pattern: psycopg2.connect(...) - - pattern: $CONN.cursor() - - pattern: $CURSOR.execute(...) - message: "PostgreSQL connection using psycopg2" - languages: [python] - severity: INFO - metadata: - category: database - library: psycopg2 - - - id: database-python-pymongo - pattern-either: - - pattern: MongoClient(...) - - pattern: $CLIENT[$DB][$COLLECTION] - - pattern: $COLLECTION.find(...) - - pattern: $COLLECTION.insert_one(...) - message: "MongoDB connection using pymongo" - languages: [python] - severity: INFO - metadata: - category: database - library: pymongo - - # TypeScript/JavaScript Database Clients - - id: database-ts-typeorm - pattern-either: - - pattern: createConnection(...) - - pattern: getRepository(...) - - pattern: $REPO.find(...) - - pattern: $REPO.save(...) - message: "Database usage via TypeORM" - languages: [typescript, javascript] - severity: INFO - metadata: - category: database - library: TypeORM - - - id: database-ts-mongoose - pattern-either: - - pattern: mongoose.connect(...) - - pattern: mongoose.model(...) - - pattern: $MODEL.find(...) - - pattern: $MODEL.create(...) - message: "MongoDB connection using Mongoose" - languages: [typescript, javascript] - severity: INFO - metadata: - category: database - library: Mongoose - - - id: database-ts-prisma - pattern-either: - - pattern: new PrismaClient() - - pattern: $PRISMA.$ENTITY.findMany(...) - - pattern: $PRISMA.$ENTITY.create(...) - message: "Database usage via Prisma" - languages: [typescript, javascript] - severity: INFO - metadata: - category: database - library: Prisma diff --git a/.semgrep/entityframework.yml b/.semgrep/entityframework.yml deleted file mode 100644 index b8b5535..0000000 --- a/.semgrep/entityframework.yml +++ /dev/null @@ -1,49 +0,0 @@ -rules: - - id: ef-dbcontext-class - patterns: - - pattern: | - class $NAME : DbContext { ... } - message: "Found Entity Framework DbContext: $NAME" - languages: [csharp] - severity: INFO - metadata: - category: database - orm: entity-framework - - - id: ef-dbset-property - patterns: - - pattern: | - public DbSet<$ENTITY> $PROP { get; set; } - message: "Found Entity Framework entity: $ENTITY" - languages: [csharp] - severity: INFO - metadata: - category: entity - orm: entity-framework - - - id: ef-connection-string - patterns: - - pattern-either: - - pattern: | - UseSqlServer($CONN) - - pattern: | - UseNpgsql($CONN) - - pattern: | - UseMySql($CONN) - - pattern: | - UseSqlite($CONN) - message: "Found database connection configuration" - languages: [csharp] - severity: INFO - metadata: - category: database-config - - - id: ef-migration - patterns: - - pattern: | - public partial class $NAME : Migration { ... } - message: "Found Entity Framework migration: $NAME" - languages: [csharp] - severity: INFO - metadata: - category: migration diff --git a/.semgrep/http-clients.yml b/.semgrep/http-clients.yml deleted file mode 100644 index 9a6e27b..0000000 --- a/.semgrep/http-clients.yml +++ /dev/null @@ -1,173 +0,0 @@ -rules: - # Java HTTP Clients - - id: http-client-java-resttemplate - pattern-either: - - pattern: new RestTemplate() - - pattern: $CLIENT.exchange(...) - - pattern: $CLIENT.getForObject(...) - - pattern: $CLIENT.postForObject(...) - - pattern: $CLIENT.put(...) - - pattern: $CLIENT.delete(...) - message: "HTTP client call using Spring RestTemplate" - languages: [java] - severity: INFO - metadata: - category: http-client - library: RestTemplate - - - id: http-client-java-webclient - pattern-either: - - pattern: WebClient.create(...) - - pattern: $CLIENT.get().uri(...) - - pattern: $CLIENT.post().uri(...) - - pattern: $CLIENT.put().uri(...) - - pattern: $CLIENT.delete().uri(...) - message: "HTTP client call using Spring WebClient" - languages: [java] - severity: INFO - metadata: - category: http-client - library: WebClient - - - id: http-client-java-okhttp - pattern-either: - - pattern: new OkHttpClient() - - pattern: $CLIENT.newCall(...) - - pattern: new Request.Builder() - message: "HTTP client call using OkHttp" - languages: [java] - severity: INFO - metadata: - category: http-client - library: OkHttp - - - id: http-client-java-apache - pattern-either: - - pattern: HttpClients.createDefault() - - pattern: $CLIENT.execute(...) - message: "HTTP client call using Apache HttpClient" - languages: [java] - severity: INFO - metadata: - category: http-client - library: ApacheHttpClient - - # C# HTTP Clients - - id: http-client-csharp-httpclient - pattern-either: - - pattern: new HttpClient() - - pattern: $CLIENT.GetAsync(...) - - pattern: $CLIENT.PostAsync(...) - - pattern: $CLIENT.PutAsync(...) - - pattern: $CLIENT.DeleteAsync(...) - - pattern: $CLIENT.SendAsync(...) - message: "HTTP client call using HttpClient" - languages: [csharp] - severity: INFO - metadata: - category: http-client - library: HttpClient - - - id: http-client-csharp-restsharp - pattern-either: - - pattern: new RestClient(...) - - pattern: $CLIENT.Execute(...) - - pattern: $CLIENT.ExecuteAsync(...) - message: "HTTP client call using RestSharp" - languages: [csharp] - severity: INFO - metadata: - category: http-client - library: RestSharp - - # Go HTTP Clients - - id: http-client-go-http - pattern-either: - - pattern: http.Get(...) - - pattern: http.Post(...) - - pattern: http.NewRequest(...) - - pattern: $CLIENT.Do(...) - message: "HTTP client call using net/http" - languages: [go] - severity: INFO - metadata: - category: http-client - library: net/http - - - id: http-client-go-resty - pattern-either: - - pattern: resty.New() - - pattern: $CLIENT.R().Get(...) - - pattern: $CLIENT.R().Post(...) - message: "HTTP client call using Resty" - languages: [go] - severity: INFO - metadata: - category: http-client - library: resty - - # Python HTTP Clients - - id: http-client-python-requests - pattern-either: - - pattern: requests.get(...) - - pattern: requests.post(...) - - pattern: requests.put(...) - - pattern: requests.delete(...) - - pattern: requests.request(...) - message: "HTTP client call using requests" - languages: [python] - severity: INFO - metadata: - category: http-client - library: requests - - - id: http-client-python-httpx - pattern-either: - - pattern: httpx.get(...) - - pattern: httpx.post(...) - - pattern: httpx.Client() - - pattern: httpx.AsyncClient() - message: "HTTP client call using httpx" - languages: [python] - severity: INFO - metadata: - category: http-client - library: httpx - - - id: http-client-python-aiohttp - pattern-either: - - pattern: aiohttp.ClientSession() - - pattern: $SESSION.get(...) - - pattern: $SESSION.post(...) - message: "HTTP client call using aiohttp" - languages: [python] - severity: INFO - metadata: - category: http-client - library: aiohttp - - # TypeScript/JavaScript HTTP Clients - - id: http-client-ts-axios - pattern-either: - - pattern: axios.get(...) - - pattern: axios.post(...) - - pattern: axios.put(...) - - pattern: axios.delete(...) - - pattern: axios.request(...) - message: "HTTP client call using axios" - languages: [typescript, javascript] - severity: INFO - metadata: - category: http-client - library: axios - - - id: http-client-ts-fetch - pattern-either: - - pattern: fetch(...) - - pattern: window.fetch(...) - message: "HTTP client call using fetch API" - languages: [typescript, javascript] - severity: INFO - metadata: - category: http-client - library: fetch diff --git a/.semgrep/jakarta-ee.yml b/.semgrep/jakarta-ee.yml deleted file mode 100644 index 17fdadf..0000000 --- a/.semgrep/jakarta-ee.yml +++ /dev/null @@ -1,33 +0,0 @@ -rules: - - id: jakarta-persistence-database - patterns: - - pattern-either: - - pattern: EntityManager $EM = ...; - - pattern: private EntityManager $EM; - - pattern-inside: | - import jakarta.persistence.EntityManager; - ... - message: "Jakarta Persistence EntityManager detected" - languages: [java] - severity: INFO - metadata: - category: database - technology: jakarta-persistence - type: SQL - confidence: HIGH - - - id: jakarta-jaxrs-http-client - patterns: - - pattern-either: - - pattern: WebTarget $T = ...; - - pattern: $CLIENT.target(...) - - pattern-inside: | - import jakarta.ws.rs.client.$X; - ... - message: "Jakarta JAX-RS Client detected" - languages: [java] - severity: INFO - metadata: - category: http_client - technology: jakarta-jaxrs - confidence: MEDIUM diff --git a/evaluate.py b/evaluate.py index aadfb2a..8ba896f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -14,6 +14,7 @@ import argparse import asyncio +import json import logging import sys import time @@ -28,8 +29,8 @@ # Import all agents from agents import (AnalyzerAgent, BaseAgent, BehaviorSummarizerAgent, - ComponentSummarizerAgent, EnvironmentSummarizerAgent, - OptimizerAgent) + ComponentSummarizerAgent, + EnvironmentSummarizerAgent, OptimizerAgent) from utils import RunManager # Import all workflows from workflows import orchestrate_complete_pipeline, orchestrate_summarizers @@ -181,7 +182,14 @@ async def evaluate_workflow(workflow_func: Callable, repo_path: str) -> None: try: logger.info(f"Starting workflow execution for repository: {repo_path_obj.absolute()}") start_time = time.time() - result = await workflow_func(str(repo_path_obj.absolute())) + if workflow_func.__name__ == "orchestrate_complete_pipeline": + benchmark_dir = run_dir / "benchmark" + result = await workflow_func( + str(repo_path_obj.absolute()), + benchmark_output_dir=str(benchmark_dir), + ) + else: + result = await workflow_func(str(repo_path_obj.absolute())) execution_time = time.time() - start_time logger.info(f"Workflow execution completed successfully") logger.info(f"RESULT:\n{result}") diff --git a/requirements.txt b/requirements.txt index f2c458d..5874b04 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,3 +34,6 @@ beautilog>=0.2.4 # Utilities python-json-logger>=2.0.0 colorama>=0.4.6 + +# Benchmark Visualization +matplotlib>=3.8.0 diff --git a/tools/__init__.py b/tools/__init__.py index ae59104..ed9d742 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -1,7 +1,8 @@ """Tools for agents.""" from .analysis import (build_analysis_bundle, read_code_snippet, read_file, - search_codebase) + search_codebase) +from .benchmark import run_benchmark_command from .behavior import (analyze_code_complexity, analyze_code_structure, analyze_data_dependencies, analyze_error_handling_strategy, diff --git a/tools/benchmark.py b/tools/benchmark.py new file mode 100644 index 0000000..6b20171 --- /dev/null +++ b/tools/benchmark.py @@ -0,0 +1,528 @@ +"""Tools for running external benchmark commands.""" + +from __future__ import annotations + +import asyncio +import json +import os +import re +import shlex +import subprocess +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence + +from langchain_core.tools import tool + + +def _normalize_command(command: str | Sequence[str]) -> List[str]: + if isinstance(command, str): + return shlex.split(command) + return list(command) + +_TIME_UNITS_MS = { + "us": 0.001, + "µs": 0.001, + "ms": 1.0, + "s": 1000.0, +} + +_DATA_UNITS_MB = { + "KB": 1.0 / 1024.0, + "MB": 1.0, + "GB": 1024.0, +} + + +def _to_ms(value: float, unit: str) -> Optional[float]: + factor = _TIME_UNITS_MS.get(unit) + if factor is None: + return None + return value * factor + + +def _to_mb(value: float, unit: str) -> Optional[float]: + factor = _DATA_UNITS_MB.get(unit) + if factor is None: + return None + return value * factor + + +def parse_wrk_output(output: str) -> Dict[str, Any]: + """Parse wrk/wrk2 output and extract summary metrics.""" + metrics: Dict[str, Any] = { + "latency_ms": {}, + "percentiles_ms": {}, + "socket_errors": {}, + } + + latency_line = re.search( + r"Latency\s+([\d\.]+)([a-zµ]+)\s+([\d\.]+)([a-zµ]+)\s+([\d\.]+)([a-zµ]+)", + output, + re.IGNORECASE, + ) + if latency_line: + avg = _to_ms(float(latency_line.group(1)), latency_line.group(2)) + stdev = _to_ms(float(latency_line.group(3)), latency_line.group(4)) + p99 = _to_ms(float(latency_line.group(5)), latency_line.group(6)) + if avg is not None: + metrics["latency_ms"]["avg"] = avg + if stdev is not None: + metrics["latency_ms"]["stdev"] = stdev + if p99 is not None: + metrics["latency_ms"]["p99"] = p99 + + for line in output.splitlines(): + match = re.match(r"^\s*([\d\.]+)%\s+([\d\.]+)([a-zµ]+)\s*$", line) + if not match: + continue + percentile = match.group(1) + value = _to_ms(float(match.group(2)), match.group(3)) + if value is not None: + metrics["percentiles_ms"][percentile] = value + + rps_match = re.search(r"Requests/sec:\s*([\d\.]+)", output) + if rps_match: + metrics["requests_per_sec"] = float(rps_match.group(1)) + + transfer_match = re.search(r"Transfer/sec:\s*([\d\.]+)\s*([KMG]B)", output) + if transfer_match: + transfer_mb = _to_mb(float(transfer_match.group(1)), transfer_match.group(2)) + if transfer_mb is not None: + metrics["transfer_per_sec_mb"] = transfer_mb + + socket_match = re.search( + r"Socket errors:\s*connect\s+(\d+),\s*read\s+(\d+),\s*write\s+(\d+),\s*timeout\s+(\d+)", + output, + ) + if socket_match: + metrics["socket_errors"] = { + "connect": int(socket_match.group(1)), + "read": int(socket_match.group(2)), + "write": int(socket_match.group(3)), + "timeout": int(socket_match.group(4)), + } + + summary_match = re.search( + r"(\d+)\s+requests in\s+([\d\.]+)([smh]),\s+([\d\.]+)([KMG]B)\s+read", + output, + ) + if summary_match: + metrics["total_requests"] = int(summary_match.group(1)) + duration = float(summary_match.group(2)) + duration_unit = summary_match.group(3) + if duration_unit == "s": + metrics["duration_seconds"] = duration + elif duration_unit == "m": + metrics["duration_seconds"] = duration * 60.0 + elif duration_unit == "h": + metrics["duration_seconds"] = duration * 3600.0 + + data_mb = _to_mb(float(summary_match.group(4)), summary_match.group(5)) + if data_mb is not None: + metrics["data_read_mb"] = data_mb + + return metrics + + +def parse_jmeter_output(output: str) -> Dict[str, Any]: + """Parse JMeter CLI output and extract summary metrics.""" + metrics: Dict[str, Any] = { + "latency_ms": {}, + "percentiles_ms": {}, + "requests_per_sec": 0.0, + "error_rate": 0.0, + } + + # Example line: + # summary = 1809335 in 00:14:41 = 2054.3/s Avg: 4 Min: 0 Max: 15003 Err: 1780589 (98.41%) + # summary + 55747 in 00:00:10 = 5653.3/s Avg: 1 Min: 0 Max: 202 Err: 55747 (100.00%) Active: 10 Started: 10 Finished: 0 + matches = re.findall( + r"summary \s*[+=]\s+(\d+)\s+in\s+([\d:]+)\s+=\s+([\d\.]+)/s\s+Avg:\s+(\d+)\s+Min:\s+(\d+)\s+Max:\s+(\d+)\s+Err:\s+(\d+)\s+\(([\d\.]+)%\)", + output, + ) + if matches: + # Take the last match (most recent summary) + last_match = matches[-1] + metrics["total_requests"] = int(last_match[0]) + duration_str = last_match[1] + # simplistic duration parsing hh:mm:ss + parts = list(map(int, duration_str.split(":"))) + duration_seconds = 0.0 + if len(parts) == 3: + duration_seconds = parts[0] * 3600 + parts[1] * 60 + parts[2] + elif len(parts) == 2: + duration_seconds = parts[0] * 60 + parts[1] + metrics["duration_seconds"] = duration_seconds + + metrics["requests_per_sec"] = float(last_match[2]) + metrics["latency_ms"]["avg"] = float(last_match[3]) + metrics["latency_ms"]["min"] = float(last_match[4]) + metrics["latency_ms"]["max"] = float(last_match[5]) + + err_count = int(last_match[6]) + err_pct = float(last_match[7]) + metrics["error_count"] = err_count + metrics["error_rate"] = err_pct + + return metrics + + +def compare_benchmark_metrics(before: Dict[str, Any], after: Dict[str, Any]) -> Dict[str, Any]: + """Compare benchmark metrics before vs after.""" + def delta(before_val: Optional[float], after_val: Optional[float]) -> Dict[str, Any]: + if before_val is None or after_val is None: + return {"before": before_val, "after": after_val, "delta": None, "pct_change": None} + diff = after_val - before_val + pct = (diff / before_val * 100.0) if before_val != 0 else None + return { + "before": before_val, + "after": after_val, + "delta": diff, + "pct_change": pct, + } + + before_metrics = before.get("metrics", {}) + after_metrics = after.get("metrics", {}) + + comparison = { + "requests_per_sec": delta(before_metrics.get("requests_per_sec"), after_metrics.get("requests_per_sec")), + "transfer_per_sec_mb": delta( + before_metrics.get("transfer_per_sec_mb"), + after_metrics.get("transfer_per_sec_mb"), + ), + "latency_avg_ms": delta( + before_metrics.get("latency_ms", {}).get("avg"), + after_metrics.get("latency_ms", {}).get("avg"), + ), + "latency_p50_ms": delta( + before_metrics.get("percentiles_ms", {}).get("50.000"), + after_metrics.get("percentiles_ms", {}).get("50.000"), + ), + "latency_p90_ms": delta( + before_metrics.get("percentiles_ms", {}).get("90.000"), + after_metrics.get("percentiles_ms", {}).get("90.000"), + ), + "latency_p99_ms": delta( + before_metrics.get("percentiles_ms", {}).get("99.000"), + after_metrics.get("percentiles_ms", {}).get("99.000"), + ), + # metrics specific to JMeter + "error_rate": delta(before_metrics.get("error_rate"), after_metrics.get("error_rate")), + } + return comparison + + +def write_benchmark_report( + output_dir: Path, + before: Dict[str, Any], + after: Dict[str, Any], + comparison: Dict[str, Any], +) -> Dict[str, Any]: + """Write benchmark summary artifacts to disk.""" + output_dir.mkdir(parents=True, exist_ok=True) + report_path = output_dir / "benchmark_report.json" + comparison_path = output_dir / "benchmark_comparison.json" + + report_path.write_text(json.dumps({"before": before, "after": after}, indent=2), encoding="utf-8") + comparison_path.write_text(json.dumps(comparison, indent=2), encoding="utf-8") + + return { + "benchmark_report": report_path.as_posix(), + "benchmark_comparison": comparison_path.as_posix(), + } + + +def render_benchmark_charts( + output_dir: Path, + comparison: Dict[str, Any], + before: Dict[str, Any], + after: Dict[str, Any], +) -> Dict[str, Any]: + """Render benchmark comparison charts (optional, requires matplotlib).""" + try: + import matplotlib.pyplot as plt # type: ignore + except Exception as exc: + return {"error": f"matplotlib_not_installed: {exc}", "charts": []} + + charts = [] + output_dir.mkdir(parents=True, exist_ok=True) + + latency_items = [ + ("p50", comparison.get("latency_p50_ms")), + ("p90", comparison.get("latency_p90_ms")), + ("p99", comparison.get("latency_p99_ms")), + ("avg", comparison.get("latency_avg_ms")), + ] + labels = [ + name + for name, item in latency_items + if item and (item.get("before") is not None or item.get("after") is not None) + ] + before_vals = [ + item.get("before") or 0.0 + for name, item in latency_items + if item and (item.get("before") is not None or item.get("after") is not None) + ] + after_vals = [ + item.get("after") or 0.0 + for name, item in latency_items + if item and (item.get("before") is not None or item.get("after") is not None) + ] + + if labels: + fig, ax = plt.subplots(figsize=(6, 4)) + x = range(len(labels)) + ax.bar([i - 0.2 for i in x], before_vals, width=0.4, label="Before") + ax.bar([i + 0.2 for i in x], after_vals, width=0.4, label="After") + ax.set_xticks(list(x)) + ax.set_xticklabels(labels) + ax.set_ylabel("Latency (ms)") + ax.set_title("Latency Comparison") + ax.legend() + chart_path = output_dir / "latency_comparison.png" + fig.tight_layout() + fig.savefig(chart_path, dpi=140) + plt.close(fig) + charts.append(chart_path.as_posix()) + + throughput_items = [ + ("rps", comparison.get("requests_per_sec")), + ("MB/s", comparison.get("transfer_per_sec_mb")), + ] + labels = [ + name + for name, item in throughput_items + if item and (item.get("before") is not None or item.get("after") is not None) + ] + before_vals = [ + item.get("before") or 0.0 + for name, item in throughput_items + if item and (item.get("before") is not None or item.get("after") is not None) + ] + after_vals = [ + item.get("after") or 0.0 + for name, item in throughput_items + if item and (item.get("before") is not None or item.get("after") is not None) + ] + + if labels: + fig, ax = plt.subplots(figsize=(6, 4)) + x = range(len(labels)) + ax.bar([i - 0.2 for i in x], before_vals, width=0.4, label="Before") + ax.bar([i + 0.2 for i in x], after_vals, width=0.4, label="After") + ax.set_xticks(list(x)) + ax.set_xticklabels(labels) + ax.set_ylabel("Throughput") + ax.set_title("Throughput Comparison") + ax.legend() + chart_path = output_dir / "throughput_comparison.png" + fig.tight_layout() + fig.savefig(chart_path, dpi=140) + plt.close(fig) + charts.append(chart_path.as_posix()) + + return {"charts": charts} + + +def _run_benchmark_command_impl( + command: str | List[str], + command_env: str, + workdir: str, + timeout_seconds: int, + output_dir: str, + label: str, +) -> Dict[str, Any]: + cwd = Path(workdir).resolve() if workdir else Path.cwd().resolve() + + # Auto-detection for default command + if not command: + # Check if we are in or near TeaStore + teastore_dir = cwd / "TeaStore" + teastore_parent = cwd.parent / "TeaStore" + + is_teastore = False + if cwd.name == "TeaStore" or (cwd / "examples" / "jmeter").exists(): + is_teastore = True + elif teastore_dir.exists(): + cwd = teastore_dir + is_teastore = True + elif teastore_parent.exists(): + cwd = teastore_parent + is_teastore = True + + if is_teastore: + # Default TeaStore JMeter command + jmeter_script = "examples/jmeter/teastore_browse_nogui.jmx" + if (cwd / jmeter_script).exists(): + command = f"jmeter -n -t {jmeter_script} -Jhostname localhost -Jport 8080 -JnumUser 10 -JrampUp 1" + else: + # Fallback or error if script not found? + pass + else: + # Default to existing behavior (DeathStarBench env var or empty) + command = os.environ.get(command_env, "") + + if not command: + # Double check if command_env is set even if we didn't suspect TeaStore + command = os.environ.get(command_env, "") + + use_shell = isinstance(command, str) + argv = _normalize_command(command) if not use_shell else [] + if (use_shell and not command) or (not use_shell and not argv): + return {"error": "empty_command", "command_env": command_env} + + if not cwd.exists(): + return {"error": "workdir_not_found", "workdir": str(cwd)} + + start = time.monotonic() + timed_out = False + try: + result = subprocess.run( + command if use_shell else argv, + capture_output=True, + text=True, + cwd=str(cwd), + timeout=timeout_seconds, + shell=use_shell, + executable="/bin/bash" if use_shell else None, + ) + stdout_text = result.stdout or "" + stderr_text = result.stderr or "" + exit_code = result.returncode + except subprocess.TimeoutExpired as exc: + timed_out = True + stdout_text = exc.stdout or "" + stderr_text = exc.stderr or "" + if isinstance(stdout_text, bytes): + stdout_text = stdout_text.decode("utf-8", errors="replace") + if isinstance(stderr_text, bytes): + stderr_text = stderr_text.decode("utf-8", errors="replace") + exit_code = None + except FileNotFoundError: + return {"error": "command_not_found", "command": argv} + except Exception as exc: + return {"error": "command_failed", "detail": str(exc)} + + duration = time.monotonic() - start + + # Detect parser + cmd_str = command if use_shell else " ".join(argv) + if "jmeter" in cmd_str: + metrics = parse_jmeter_output(stdout_text) + else: + metrics = parse_wrk_output(stdout_text) or parse_wrk_output(stderr_text) + + payload: Dict[str, Any] = { + "command": command if use_shell else argv, + "workdir": str(cwd), + "exit_code": exit_code, + "timed_out": timed_out, + "duration_seconds": round(duration, 3), + "stdout": stdout_text, + "stderr": stderr_text, + "metrics": metrics, + } + + if output_dir: + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + stdout_path = out_dir / f"{label}_stdout.txt" + stderr_path = out_dir / f"{label}_stderr.txt" + stdout_path.write_text(stdout_text or "", encoding="utf-8") + stderr_path.write_text(stderr_text or "", encoding="utf-8") + payload["stdout_path"] = stdout_path.as_posix() + payload["stderr_path"] = stderr_path.as_posix() + + return payload + + +@tool +async def run_benchmark_command( + command: str | List[str] = "", + command_env: str = "BENCHMARK_CMD", + workdir: str = "", + timeout_seconds: int = 1800, + output_dir: str = "", + label: str = "benchmark", +) -> str: + """Run an external benchmark command and return stdout/stderr metadata. + + Args: + command (str | List[str]): Command string or argv list to execute. If empty, loads from command_env. + command_env (str): Environment variable containing the command string. Default: BENCHMARK_CMD. + workdir (str): Working directory for the command. Defaults to current directory. + timeout_seconds (int): Timeout in seconds. Default: 1800. + output_dir (str): Optional directory to write stdout/stderr files. + label (str): Label prefix for output files when output_dir is set. + + Returns: + str: JSON string with execution metadata and outputs. + """ + payload = await asyncio.to_thread( + _run_benchmark_command_impl, + command, + command_env, + workdir, + timeout_seconds, + output_dir, + label, + ) + return json.dumps(payload) + + +@tool +async def run_benchmark_comparison( + command_env: str = "BENCHMARK_CMD", + workdir: str = "", + timeout_seconds: int = 1800, + output_dir: str = "", + render_charts: bool = True, +) -> str: + """Run benchmark commands before/after and return comparison payload.""" + command_before = os.environ.get(command_env, "") + command_after = os.environ.get(command_env, "") + + before = await asyncio.to_thread( + _run_benchmark_command_impl, + command_before, + command_env, + workdir, + timeout_seconds, + output_dir, + "before", + ) + after = await asyncio.to_thread( + _run_benchmark_command_impl, + command_after, + command_env, + workdir, + timeout_seconds, + output_dir, + "after", + ) + + if "error" in before: + return json.dumps({"error": "before_failed", "before": before}) + if "error" in after: + return json.dumps({"error": "after_failed", "after": after}) + + comparison = compare_benchmark_metrics(before, after) + payload: Dict[str, Any] = { + "before": before, + "after": after, + "comparison": comparison, + } + + if output_dir: + report_paths = write_benchmark_report( + Path(output_dir), before, after, comparison + ) + payload["report_paths"] = report_paths + if render_charts: + payload["charts"] = render_benchmark_charts( + Path(output_dir), comparison, before, after + ) + + return json.dumps(payload) + diff --git a/workflows/complete_pipeline.py b/workflows/complete_pipeline.py index 5e8089b..146cd1e 100644 --- a/workflows/complete_pipeline.py +++ b/workflows/complete_pipeline.py @@ -1,11 +1,13 @@ """LangGraph-based orchestrator for the complete optimization pipeline. -This module implements a four-phase iterative pipeline: -1. PHASE 1 - SUMMARIZATION: Run environment, component, and behavior summarizers in parallel -2. PHASE 2 - ANALYSIS: Analyze summaries to produce optimization guidance with static signals -3. PHASE 3 - OPTIMIZATION: Apply safe code improvements based on analysis -4. PHASE 4 - CORRECTNESS CHECK: Validate applied changes for correctness and quality -5. LOOP: Conditionally loop back to summarization or exit based on iteration count +This module implements a six-phase iterative pipeline: +1. PHASE 1 - BENCHMARK (BEFORE): Run external benchmark command to capture baseline +2. PHASE 2 - SUMMARIZATION: Run environment, component, and behavior summarizers in parallel +3. PHASE 3 - ANALYSIS: Analyze summaries to produce optimization guidance with static signals +4. PHASE 4 - OPTIMIZATION: Apply safe code improvements based on analysis +5. PHASE 5 - CORRECTNESS CHECK: Validate applied changes for correctness and quality +6. PHASE 6 - BENCHMARK (AFTER): Run external benchmark command to capture post-change +7. LOOP: Conditionally loop back to benchmark-before or exit based on iteration count The pipeline uses temporary files for inter-agent communication and manages state flow through a LangGraph workflow with iterative refinement. @@ -24,6 +26,8 @@ from agents import (AnalysisReport, AnalyzerAgent, OptimizationReport, OptimizerAgent, orchestrate_code_correctness) +from tools.benchmark import (compare_benchmark_metrics, render_benchmark_charts, + run_benchmark_command, write_benchmark_report) from .summary_orchestrator import orchestrate_summarizers @@ -79,6 +83,7 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: state_fields: dict[str, Type] = { "code_path": str, "iteration_count": int, + "benchmark_output_dir": str, } # Add state fields from each phase's return fields @@ -91,6 +96,7 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: # Define pipeline phases in execution order _PIPELINE_PHASES = [ + PipelinePhase("benchmark_before", None, return_fields=["benchmark_before"]), PipelinePhase("summarization", None, return_fields=["summary_text"]), PipelinePhase( "analysis", @@ -107,15 +113,31 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: PipelinePhase( "correctness_check", None, - return_fields=["analysis_result", "correctness_verdict"], + return_fields=["analysis_result", "correctness_verdict", "correctness_report"], return_class=tuple, # Composite: (AnalysisResult, CorrectnessVerdict) ), + PipelinePhase("benchmark_after", None, return_fields=["benchmark_after", "benchmark_report"]), ] # Dynamically build PipelineState from phase definitions PipelineState = build_pipeline_state(_PIPELINE_PHASES) +async def benchmark_before_node(state: PipelineState) -> dict: + """PHASE 1: Run baseline benchmark before optimization.""" + logger.info("PHASE 1: Running baseline benchmark (before)") + output_dir = state.get("benchmark_output_dir", "") + payload = { + "command_env": "BENCHMARK_CMD", + "workdir": state["code_path"], + "label": "before", + "output_dir": output_dir, + } + result = await run_benchmark_command.ainvoke(payload) + logger.info("PHASE 1: Baseline benchmark complete") + return {"benchmark_before": result} + + async def summary_node(state: PipelineState) -> dict: """PHASE 1: Run summarization workflow and combine summaries. @@ -130,7 +152,7 @@ async def summary_node(state: PipelineState) -> dict: Dictionary with combined summary_text and incremented iteration_count """ iteration = state.get("iteration_count", 0) + 1 - logger.info(f"PHASE 1: Running summarization orchestrator (iteration {iteration}/{MAX_ITERATIONS})") + logger.info(f"PHASE 2: Running summarization orchestrator (iteration {iteration}/{MAX_ITERATIONS})") summaries = await orchestrate_summarizers(state["code_path"]) summary_text = ( @@ -142,7 +164,7 @@ async def summary_node(state: PipelineState) -> dict: f"{summaries.get('behavior_summary', '')}\n" ) - logger.info(f"PHASE 1: Summarization complete ({len(summary_text)} chars)") + logger.info(f"PHASE 2: Summarization complete ({len(summary_text)} chars)") return {"summary_text": summary_text, "iteration_count": iteration} @@ -165,7 +187,7 @@ async def analyze_node(state: PipelineState) -> dict: Returns: Dictionary with analysis_report (JSON string) """ - logger.info("PHASE 2: Running AnalyzerAgent") + logger.info("PHASE 3: Running AnalyzerAgent") agent = AnalyzerAgent() with tempfile.TemporaryDirectory(prefix="analysis_inputs_") as temp_dir: @@ -175,7 +197,7 @@ async def analyze_node(state: PipelineState) -> dict: # Write summary to temporary file summary_path.write_text(state.get("summary_text", ""), encoding="utf-8") - logger.info("PHASE 2: Analyzer input files created") + logger.info("PHASE 3: Analyzer input files created") # Create payload with file paths for the agent to read payload = { @@ -184,7 +206,7 @@ async def analyze_node(state: PipelineState) -> dict: } result = await agent.run(json.dumps(payload)) - logger.info("PHASE 2: Analysis complete") + logger.info("PHASE 3: Analysis complete") return {"analysis_report": result} @@ -210,7 +232,7 @@ async def optimize_node(state: PipelineState) -> dict: Returns: Dictionary with optimization_report (JSON string) """ - logger.info("PHASE 3: Running OptimizerAgent") + logger.info("PHASE 4: Running OptimizerAgent") agent = OptimizerAgent() with tempfile.TemporaryDirectory(prefix="optimizer_inputs_") as temp_dir: @@ -222,7 +244,7 @@ async def optimize_node(state: PipelineState) -> dict: analysis_path.write_text(state.get("analysis_report", ""), encoding="utf-8") summary_path.write_text(state.get("summary_text", ""), encoding="utf-8") - logger.info("PHASE 3: Optimizer input files created") + logger.info("PHASE 4: Optimizer input files created") # Create payload with file paths for the agent to read payload = { @@ -232,13 +254,64 @@ async def optimize_node(state: PipelineState) -> dict: } result = await agent.run(json.dumps(payload)) - logger.info("PHASE 3: Optimization complete") + logger.info("PHASE 4: Optimization complete") return {"optimization_report": result} +async def benchmark_after_node(state: PipelineState) -> dict: + """PHASE 6: Run benchmark after optimization and compare with baseline.""" + logger.info("PHASE 6: Running post-optimization benchmark (after)") + output_dir = state.get("benchmark_output_dir", "") + payload = { + "command_env": "BENCHMARK_CMD", + "workdir": state["code_path"], + "label": "after", + "output_dir": output_dir, + } + after_raw = await run_benchmark_command.ainvoke(payload) + logger.info("PHASE 6: Post-optimization benchmark complete") + + before_raw = state.get("benchmark_before", "") + try: + before = json.loads(before_raw) if before_raw else {} + except json.JSONDecodeError: + before = {} + try: + after = json.loads(after_raw) if after_raw else {} + except json.JSONDecodeError: + after = {} + + comparison = compare_benchmark_metrics(before, after) + report = { + "benchmark_name": Path(state["code_path"]).name, + "command_env": "BENCHMARK_CMD", + "workdir": state["code_path"], + "before": before, + "after": after, + "comparison": comparison, + } + + report_paths = {} + charts = {} + if output_dir: + report_paths = write_benchmark_report( + Path(output_dir), before, after, comparison + ) + charts = render_benchmark_charts( + Path(output_dir), comparison, before, after + ) + report["report_paths"] = report_paths + report["charts"] = charts + + return { + "benchmark_after": after_raw, + "benchmark_report": json.dumps(report, indent=2), + } + + async def correctness_check_node(state: PipelineState) -> dict: - """PHASE 4: Run code correctness workflow. + """PHASE 5: Run code correctness workflow. Orchestrates a two-phase correctness check: - Phase 4a: Detailed analysis of applied code changes @@ -250,7 +323,7 @@ async def correctness_check_node(state: PipelineState) -> dict: Returns: Dictionary with analysis_result and correctness_verdict """ - logger.info("PHASE 4: Running code correctness workflow") + logger.info("PHASE 5: Running code correctness workflow") # Parse the optimization report to extract applied changes try: @@ -284,11 +357,17 @@ async def correctness_check_node(state: PipelineState) -> dict: code_snippet=code_snippet, ) - logger.info("PHASE 4: Code correctness workflow complete") + logger.info("PHASE 5: Code correctness workflow complete") return { "analysis_result": correctness_state.get("analysis_result", ""), "correctness_verdict": correctness_state.get("correctness_verdict", ""), + "correctness_report": json.dumps( + { + "analysis_result": correctness_state.get("analysis_result", ""), + "correctness_verdict": correctness_state.get("correctness_verdict", ""), + } + ), } @@ -316,23 +395,27 @@ def should_continue_loop(state: PipelineState) -> Literal["summarization", str]: def build_complete_pipeline() -> CompiledStateGraph: - """Build the complete four-phase optimization pipeline with iteration loop. + """Build the complete six-phase optimization pipeline with iteration loop. Constructs a LangGraph StateGraph from phase definitions that orchestrates: - 1. Summarization (parallel environment, component, behavior summaries) - 2. Analysis (structured guidance based on summaries with static signals) - 3. Optimization (apply safe code improvements) - 4. Correctness Check (orchestrates pain analysis and structured verdict) - 5. Loop decision (conditionally loop back to summarization or end) + 1. Benchmark (before) (run external benchmark command) + 2. Summarization (parallel environment, component, behavior summaries) + 3. Analysis (structured guidance based on summaries with static signals) + 4. Optimization (apply safe code improvements) + 5. Correctness Check (orchestrates pain analysis and structured verdict) + 6. Benchmark (after) (run external benchmark command) + 7. Loop decision (conditionally loop back to benchmark-before or end) The workflow follows a sequential DAG with conditional loop: - START → summarization → analysis → optimization → correctness_check → (loop decision) → {summarization | END} + START → benchmark_before → summarization → analysis → optimization → correctness_check → benchmark_after → (loop decision) → {summarization | END} State fields are dynamically generated from phase definitions: + - benchmark_before → benchmark_before - summarization → summary_text - analysis → analysis_report - optimization → optimization_report - - correctness_check → pain_analysis_result, correctness_verdict + - correctness_check → analysis_result, correctness_verdict, correctness_report + - benchmark_after → benchmark_after, benchmark_report Returns: Compiled LangGraph StateGraph ready for invocation @@ -342,10 +425,12 @@ def build_complete_pipeline() -> CompiledStateGraph: # Map phase names to node functions node_functions = { + "benchmark_before": benchmark_before_node, "summarization": summary_node, "analysis": analyze_node, "optimization": optimize_node, "correctness_check": correctness_check_node, + "benchmark_after": benchmark_after_node, } # Add nodes from phase definitions @@ -382,7 +467,7 @@ def build_complete_pipeline() -> CompiledStateGraph: return workflow.compile() -def _build_initial_state(code_path: str) -> dict[str, Any]: +def _build_initial_state(code_path: str, benchmark_output_dir: str = "") -> dict[str, Any]: """Build initial state with all dynamic fields initialized. Args: @@ -394,6 +479,7 @@ def _build_initial_state(code_path: str) -> dict[str, Any]: initial_state = { "code_path": code_path, "iteration_count": 0, + "benchmark_output_dir": benchmark_output_dir, } # Initialize all return fields from phases with empty strings @@ -404,16 +490,20 @@ def _build_initial_state(code_path: str) -> dict[str, Any]: return initial_state -async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: +async def orchestrate_complete_pipeline( + code_path: str, benchmark_output_dir: str = "" +) -> PipelineState: """Run the complete optimization pipeline end-to-end. Executes the optimization pipeline with iterative refinement through dynamically configured phases: - 1. Summarization: Generates environment, component, and behavior summaries - 2. Analysis: Produces structured optimization guidance with static analysis - 3. Optimization: Applies safe code improvements - 4. Correctness Check: Validates applied changes for correctness and quality - 5. Loop: Conditionally iterates up to MAX_ITERATIONS times + 1. Benchmark (before): Captures baseline benchmark metrics + 2. Summarization: Generates environment, component, and behavior summaries + 3. Analysis: Produces structured optimization guidance with static analysis + 4. Optimization: Applies safe code improvements + 5. Correctness Check: Validates applied changes for correctness and quality + 6. Benchmark (after): Runs external benchmark command if configured + 7. Loop: Conditionally iterates up to MAX_ITERATIONS times State fields are dynamically generated from phase definitions: - Phase outputs are automatically added to state based on agent.return_state_field @@ -426,11 +516,15 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: Returns: Final pipeline state containing dynamically generated fields: - code_path: Input repository path + - benchmark_before: Baseline benchmark JSON from final iteration - summary_text: Combined summaries from final iteration - analysis_report: Structured AnalysisReport JSON from final iteration - optimization_report: Final OptimizationReport JSON from final iteration - analysis_result: Detailed correctness analysis from final iteration - correctness_verdict: Yes/No verdict from final iteration + - correctness_report: Combined correctness report JSON from final iteration + - benchmark_after: Post-optimization benchmark JSON from final iteration + - benchmark_report: Combined benchmark comparison JSON from final iteration - iteration_count: Number of iterations completed """ logger.info(f"Starting complete optimization pipeline for {code_path}") @@ -439,7 +533,7 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: workflow = build_complete_pipeline() - initial_state = _build_initial_state(code_path) + initial_state = _build_initial_state(code_path, benchmark_output_dir) final_state: PipelineState = await workflow.ainvoke(initial_state) logger.info("Complete optimization pipeline finished")