@unpublished{Robusta,
  author  = {Karimi, Pantea and Rouhana, Dany and Namyar, Pooria and Kakarla, Siva Kesava Reddy and Arun, Venkat and Arzani, Behnaz},
  title   = {Robust Heuristic Algorithm Design with LLMs},
  journal = {arXiv:2510.08755},
  doi     = {2510.08755},
  year    = {2025},
  file    = {Robusta.pdf}
}
We posit that we can generate more robust and performant heuristics if we augment approaches using LLMs for heuristic design with tools that explain why heuristics underperform and suggestions about how to fix them. We find even simple ideas that (1) expose the LLM to instances where the heuristic underperforms; (2) explain why they occur; and (3) specialize design to regions in the input space, can produce more robust algorithms compared to existing techniques — the heuristics we produce have a 28x better worst-case performance compared to FunSearch, improve average performance, and maintain the runtime.
@unpublished{Savaal,
  author  = {Noorbakhsh*, Kimia and Chandler*, Joseph and Karimi*, Pantea and Alizadeh, Mohammad and Balakrishnan, Hari},
  title   = {Savaal: Scalable Concept-Driven Question Generation to Enhance Human Learning},
  journal = {arXiv:2502.12477},
  doi     = {2502.12477},
  year    = {2025},
  file    = {savaal_arxiv.pdf}
}
Assessing and enhancing human learning through question-answering is vital, yet automating this process remains challenging. While large language models (LLMs) excel at summarization and query responses, their ability to generate meaningful questions for learners is underexplored. We propose Savaal, a scalable question-generation system with three objectives: (i) scalability, enabling question generation from hundreds of pages of text, (ii) depth of understanding, producing questions beyond factual recall to test conceptual reasoning, and (iii) domain-independence, automatically generating questions across diverse knowledge areas. Instead of providing an LLM with large documents as context, Savaal improves results with a three-stage processing pipeline. Our evaluation with 76 human experts on 71 papers and PhD dissertations shows that Savaal generates questions that better test depth of understanding by 6.5X for dissertations and 1.5X for papers compared to a direct-prompting LLM baseline. Notably, as document length increases, Savaal’s advantages in higher question quality and lower cost become more pronounced.
@unpublished{EPFL_Report,
  author  = {Karimi, Pantea},
  title   = {Location Verification using Latencies and Claimed Coordinates on the Blockchain},
  journal = {EPFL Internship Report},
  year    = {2019},
  file    = {EPFL_Report.pdf}
}
In many applications such as processing of transactions, speed is an important factor. In self-organizing communities without relying on a central party, the processing of transactions can be performed by a set of validators. To reach higher speed in transaction validation, a Trust-but-Verify approach can be taken. For example, in paying for the daily purchase in the local supermarket with cryptocurrency, a user can rely on the local consensus and enjoy the fast processing of the transaction, but can still verify the global state to make sure at the end that the transaction was also verified in the global consensus and was not a double-spend. However, the user has a choice not to trust the local consensus and wait longer for the global consensus. In the Trust-but-Verify approach, nearby validators can start processing the transactions and provide a fast, weak, and temporary proof, while the global verification is being computed by all the validators in the system. To find all the nearby validators for local consensus, each validator should know its latencies to all the other validators in the system. The primary goal of this project is to devise a scalable fault-tolerant algorithm to estimate the pairwise latencies among all the nodes of the system. The challenge is that some nodes are malicious and behave arbitrarily, including attempts to mislead others about their latencies.
@published{Glia,
  author  = {Hamadanian*, Pouya and Karimi*, Pantea and Nasr-Esfahany*, Arash and Noorbakhsh*, Kimia and Chandler, Joseph and ParandehGheibi, Ali and Alizadeh, Mohammad and Balakrishnan, Hari},
  title   = {Glia: A Human-Inspired AI for Automated Systems Design and Optimization},
  journal = {ACM Conference on AI and Agentic Systems (CAIS'26)},
  doi     = {2510.27176},
  git     = {https://github.com/mit-nms/Engram/tree/glia},
  year    = {2026},
  file    = {Glia.pdf}
}
Can an AI autonomously design mechanisms for computer systems on par with the creativity and reasoning of human experts? We present Glia, an AI architecture for networked systems design that uses large language models (LLMs) in a human-inspired, multi-agent workflow. Each agent specializes in reasoning, experimentation, and analysis, collaborating through an evaluation framework that grounds abstract reasoning in empirical feedback. Unlike prior ML-for-systems methods that optimize black-box policies, Glia generates interpretable designs and exposes its reasoning process. When applied to a distributed GPU cluster for LLM inference, it produces new algorithms for request routing, scheduling, and auto-scaling that perform at human-expert levels in significantly less time, while yielding novel insights into workload behavior. Our results suggest that by combining reasoning LLMs with structured experimentation, an AI can produce creative and understandable designs for complex systems problems.
@published{Engram,
  author  = {Karimi*, Pantea and Noorbakhsh*, Kimia and Alizadeh, Mohammad and Balakrishnan, Hari},
  title   = {Improving Coherence and Persistence in Agentic AI for System Optimization},
  journal = {ACM Conference on AI and Agentic Systems (CAIS'26)},
  doi     = {https://arxiv.org/pdf/2603.21321},
  git     = {https://github.com/mit-nms/Engram},
  year    = {2026}
}
Designing high-performance system heuristics is a creative, iterative process requiring experts to form hypotheses and execute multi-step conceptual shifts. While Large Language Models (LLMs) show promise in automating this loop, they struggle with complex system problems due to two critical failure modes: evolutionary neighborhood bias and the coherence ceiling. Evolutionary methods often remain trapped in local optima by relying on scalar benchmark scores, failing when coordinated multi-step changes are required. Conversely, existing agentic frameworks suffer from context degradation over long horizons or fail to accumulate knowledge across independent runs. We present Engram, an agentic researcher architecture that addresses these limitations by decoupling long-horizon exploration from the constraints of a single context window. Engram organizes exploration into a sequence of agents that iteratively design, test, and analyze mechanisms. At the conclusion of each run, an agent stores code snapshots, logs, and results in a persistent Archive and distills high-level modeling insights into a compact, persistent Research Digest. Subsequent agents then begin with a fresh context window, reading the Research Digest to build on prior discoveries. We find that Engram exhibits superior performance across diverse domains including multi-cloud multicast, LLM inference request routing, and optimizing KV cache reuse in databases with natural language queries.
@published{Reparo,
  author  = {Li, Tianhong and Sivaraman, Vibhaalakshmi and Karimi, Pantea and Fan, Lijie and Alizadeh, Mohammad and Katabi, Dina},
  title   = {Reparo: Loss-Resilient Generative Codec for Video Conferencing},
  journal = {Machine Learning Systems (MLSys'26)},
  year    = {2026},
  file    = {Reparo.pdf}
}
Packet loss during video conferencing often leads to poor quality and video freezing. Attempting to retransmit lost packets is often impractical due to the need for real-time playback. Employing Forward Error Correction (FEC) for recovering the lost packets is challenging as it is difficult to determine the appropriate redundancy level. To address these issues, we introduce Reparo — a loss-resilient video conferencing framework based on generative deep learning models. Our approach involves generating missing information when a frame or part of a frame is lost. This generation is conditioned on the data received thus far, taking into account the model’s understanding of how people and objects appear and interact within the visual realm. Experimental results, using publicly available video conferencing datasets, demonstrate that Reparo outperforms state-of-the-art FEC-based video conferencing solutions in terms of both video quality (measured through PSNR, SSIM, and LPIPS) and the occurrence of video freezes.
@published{MetaEase,
  title   = {Heuristic Analysis from Source Code via Symbolic-Guided Optimization},
  author  = {Karimi, Pantea and Kakarla, Siva Kesava Reddy and Namyar, Pooria and Segarra, Santiago and Beckett, Ryan and Alizadeh, Mohammad and Arzani, Behnaz},
  journal = {USENIX Symposium on Networked Systems Design and Implementation (NSDI'26)},
  year    = {2026},
  file    = {MetaEase.pdf},
  doi     = {https://www.usenix.org/conference/nsdi26/presentation/karimi},
  git     = {https://github.com/microsoft/MetaEase/}
}
Large-scale systems rely on heuristics to tackle NP-hard problems such as traffic engineering, virtual machine placement, and packet scheduling. While these heuristics are efficient, they can exhibit severe performance gaps under certain workloads, leading to outages or costly over-provisioning. This risk has motivated tools that attempt to identify inputs causing worst-case underperformance. However, using these tools in practice often requires rewriting heuristics as formal mathematical models—a process that is time-consuming, error-prone, and excludes many real-world algorithms. We introduce MetaEase, a practical general-domain analyzer that works by directly analyzing a heuristic’s source code, eliminating the need for formal modeling. MetaEase combines code-aware input generation with guided search to uncover worst-case scenarios efficiently, even for heuristics with randomness (e.g., various traffic engineering schemes) or non-convex behavior (e.g., bin packing for virtual machine placement). Across five problem domains and eight heuristics, MetaEase matched or exceeded MetaOpt, a state-of-the-art optimization-based heuristic analyzer, in most cases; in the remainder, it achieved at least 85–98% of its performance and often ran faster. Against black-box optimization baselines, it won in 88% of settings and ranked in the top two otherwise. MetaEase analyzed Arrow, a widely studied networking heuristic, which cannot be analyzed by any of the state-of-the-art heuristic analyzers. We revealed previously unknown performance gaps in Arrow.
@published{Vidaptive,
  author  = {Karimi, Pantea and Fouladi, Sadjad and Sivaraman, Vibhaalakshmi and Alizadeh, Mohammad},
  title   = {Tight Loops, Smooth Streams: Responsive Congestion Control for Real-Time Video},
  journal = {New Ideas in Networked Systems (NINeS'26)},
  year    = {2026},
  file    = {Vidaptive.pdf},
  doi     = {https://drops.dagstuhl.de/entities/document/10.4230/OASIcs.NINeS.2026.9}
}
Real-time video streaming relies on rate control mechanisms to adapt video bitrate to network capacity while maintaining high utilization and low delay. However, the current video rate controllers, such as Google Congestion Control (GCC) in WebRTC, are very slow to respond to network changes, leading to link under-utilization and latency spikes. While recent delay-based congestion control algorithms promise high efficiency and rapid adaptation to variable conditions, low-latency video applications have been unable to adopt these schemes due to the intertwined relationship between video encoders and rate control in current systems. This paper introduces Vidaptive, a new rate control mechanism designed for low-latency video applications. Vidaptive decouples packet transmission decisions from encoder output, injecting dummy padding traffic as needed to treat video streams akin to backlogged flows controlled by a delay-based congestion controller. Vidaptive then adapts the frame rate, resolution, and target bitrate of the encoder to align the video bitrate with the congestion controller’s sending rate. Our evaluations atop WebRTC show that, across a set of cellular traces, Vidaptive achieves 2x higher video bitrate and 1.6 dB higher PSNR, and it reduces 95th-percentile frame latency by 2.7s with a slight increase in median frame latency.
@published{Xplain,
  title   = {Towards Safer Heuristics With XPlain},
  author  = {Karimi*, Pantea and Pirelli*, Solal and Kakarla, Siva Kesava Reddy and Beckett, Ryan and Segarra, Santiago and Li, Beibin and Namyar, Pooria and Arzani, Behnaz},
  journal = {Proceedings of the 23rd ACM Workshop on Hot Topics in Networks (HotNets'24)},
  year    = {2024},
  doi     = {https://dl.acm.org/doi/abs/10.1145/3696348.3696884},
  file    = {Xplain.pdf}
}
Many problems that cloud operators solve are computationally expensive, and operators often use heuristic algorithms (that are faster and scale better than optimal) to solve them more efficiently. Heuristic analyzers enable operators to find when and by how much their heuristics underperform. However, these tools do not provide enough detail for operators to mitigate the heuristic’s impact in practice: they only discover a single input instance that causes the heuristic to underperform (and not the full set), and they do not explain why. We propose XPlain, a tool that extends these analyzers and helps operators understand when and why their heuristics underperform. We present promising initial results that show such an extension is viable.
@published{Dumbo,
  author  = {Karimi, Pantea},
  title   = {Bridging the Gap Between Real-time Video and Backlogged Traffic Congestion Control},
  journal = {Massachusetts Institute of Technology},
  doi     = {https://hdl.handle.net/1721.1/151675},
  year    = {2023},
  file    = {Dumbo.pdf}
}
Real-time video applications, such as video conferencing, have become essential to our daily lives, and ensuring reliable and high-quality video delivery in the face of network fluctuation and resource constraints is critical. However, video congestion control algorithms have been criticized for their sub-optimal performance in managing network congestion and maintaining satisfactory video quality and latency. At the same time, state-of-the-art congestion control algorithms have demonstrated remarkable performance improvements, effectively addressing network congestion challenges and enhancing the overall quality of data transmission. In this work, we first demonstrate why there is such a gap between the performance of congestion control schemes on backlogged flows compared to real-time video streams. Second, we present Dumbo, a design for reshaping the video traffic to look like backlogged traffic, thus enabling state-of-the-art delay-sensitive congestion control algorithms for real-time video. We implemented Dumbo atop WebRTC and evaluated it on emulated network conditions using real-world cellular network traces. Our results show that Dumbo in comparison with GCC achieves a 1.5 dB improvement in PSNR, 1.6 dB improvement in SSIM, 100 ms lower frame latency, 35x faster convergence time, 16% increase in the video bitrate, 32% increase in network utilization, and 4x reduction in the network queueing delay.
@published{Gemino,
  author  = {Sivaraman, Vibhaalakshmi and Karimi, Pantea and Venkatapathy, Vedantha and Khani, Mehrdad and Fouladi, Sadjad and Alizadeh, Mohammad and Durand, Frédo and Sze, Vivienne},
  title   = {Gemino: Practical and Robust Neural Compression for Video Conferencing},
  journal = {USENIX Symposium on Networked Systems Design and Implementation (NSDI'24)},
  doi     = {https://www.usenix.org/conference/nsdi24/presentation/sivaraman},
  git     = {https://github.com/geminovc},
  year    = {2024},
  file    = {Gemino.pdf}
}
Video conferencing systems suffer from poor user experience when network conditions deteriorate because current video codecs simply cannot operate at extremely low bitrates. Recently, several neural alternatives have been proposed that reconstruct talking head videos at very low bitrates using sparse representations of each frame such as facial landmark information. However, these approaches produce poor reconstructions in scenarios with major movement or occlusions over the course of a call, and do not scale to higher resolutions. We design Gemino, a new neural compression system for video conferencing based on a novel high-frequency-conditional super-resolution pipeline. Gemino upsamples a very low-resolution version of each target frame while enhancing high-frequency details (e.g., skin texture, hair, etc.) based on information extracted from a single high-resolution reference image. We use a multi-scale architecture that runs different components of the model at different resolutions, allowing it to scale to resolutions comparable to 720p, and we personalize the model to learn specific details of each person, achieving much better fidelity at low bitrates. We implement Gemino atop aiortc, an open-source Python implementation of WebRTC, and show that it operates on 1024x1024 videos in real-time on a Titan X GPU, and achieves 2.2-5x lower bitrate than traditional video codecs for the same perceptual quality.
Pantea Karimi
Ph.D. Student
MIT CSAIL
MIT CSAIL 32-G918
32 Vassar St
Cambridge, USA
© 2026 Pantea Karimi