@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: C:\alldata\data\webbib\bib2bib-1.92.exe -oc conf -ob conf.bib -c '$type = "INPROCEEDINGS"' cormode.bib}}
  author = {Graham Cormode and Igor L. Markov},
  title = {Federated Calibration and Evaluation of Binary Classifiers},
  booktitle = {Proceedings of the {VLDB} Endowment},
  year = 2023,
  volume = 16,
  number = 11,
  pages = {3253--3265},
  url = {../papers/fedevalvldb.pdf},
  slides = {../slides/fedevalvldbslides.pdf},
  video = {},
  link = {},
  pubsite = {},
  abstract = {
      We address two major obstacles to practical deployment of AI-based models on distributed private data.
      Whether a model was trained by a federation of cooperating clients or
      trained centrally,
      (1) the output scores must be calibrated, and (2)  performance metrics must be evaluated --- all without assembling labels in one place. 
      In particular, we show how to perform calibration and compute the standard metrics of precision, recall, accuracy and ROC-AUC
      in the federated setting under three privacy models ($i$) secure aggregation, ($ii$) \red{distributed} differential privacy, ($iii$) local differential privacy. Our theorems and experiments clarify tradeoffs between privacy, accuracy, and data efficiency. 
    They also help decide if a given application has sufficient data to support federated calibration and evaluation.}
  author = {Wei-Ning Chen and Ayfer \"Ozg\"ur and Graham Cormode and Akash Bharadwaj},
  title = {The Communication Cost of Security and Privacy in Federated Frequency Estimation},
  year = 2023,
  booktitle = {AISTATS},
  url = {../papers/fedfreqest.pdf},
  link = {},
  link2 = {},
  pages = {4247--4274},
  abstract = {
   We consider the federated frequency estimation problem, where each user holds a private item $X_i$ from a size-$d$ domain and a server aims to estimate the empirical frequency (i.e., histogram) of $n$ items with $n \ll d$. Without any security and privacy considerations, each user can communicate their item to the server by using $\log d$ bits. A naive application of secure aggregation protocols would, however, require $d\log n$ bits per user. Can we reduce the communication needed for secure aggregation, and does security come with a fundamental cost in communication?  
 In this paper, we develop an information-theoretic model for secure aggregation that allows us to characterize the fundamental cost of security and privacy in terms of communication. We show that with security (and without privacy) $\Omega\left( n \log d \right)$ bits per user are necessary and sufficient to allow the server to compute the frequency distribution. This is significantly smaller than the $d\log n$ bits per user needed by the naive scheme, but significantly higher than the $\log d$ bits per user needed without security. To achieve differential privacy, we construct a linear scheme based on a noisy sketch that locally perturbs the data and does not require a trusted server (a.k.a. distributed differential privacy). We analyze this scheme under $L_2$ and $L_{infinty}$ loss. By using our information-theoretic framework, we show that the scheme achieves the optimal accuracy-privacy trade-off with optimal communication cost, while matching the performance in the centralized case where data is stored in the central server.
  author = {Jonathan Hehir and Daniel Ting and Graham Cormode},
  title = {Sketch-Flip-Merge: Mergeable Sketches for Private Distinct Counting},
  booktitle = {International Conference on Machine Learning, ({ICML})},
  year = 2023,
  link = {},
  url = {../papers/f0dp.pdf},
  abstract = {
    Data sketching is a critical tool for distinct counting, enabling multisets to be represented by compact summaries that admit fast cardinality estimates. Because sketches may be merged to summarize multiset unions, they are a basic building block in data warehouses. Although many practical sketches for cardinality estimation exist, none provide privacy when merging. We propose the first practical cardinality sketches that are simultaneously mergeable, differentially private (DP), and have low empirical errors. These introduce a novel randomized algorithm for performing logical operations on noisy bits, a tight privacy analysis, and provably optimal estimation.
    Our sketches dramatically outperform existing theoretical solutions in simulations and on real-world data.
  author = {Ari Biswas and Graham Cormode},
  title = {Interactive Proofs For Differentially Private Counting},
  booktitle = {ACM Conference on Computer and Communications Security},
  year = 2023,
  url = {../papers/ip4dp.pdf},
  doi = {},
  link = {},
  abstract = {
    Differential Privacy (DP) is often presented as a strong privacy-enhancing technology with broad applicability and advocated as a de-facto standard for releasing aggregate statistics on sensitive data. 
    However, in many embodiments, DP introduces a new attack surface: a malicious entity entrusted with releasing statistics could manipulate the results and use the randomness of DP as a convenient smokescreen to mask its nefariousness. 
    Since revealing the random noise would obviate the purpose of introducing it, the miscreant may have a perfect alibi.  
    To close this loophole, we introduce the idea of \textit{Interactive Proofs For Differential Privacy}, which requires the publishing entity to output a zero knowledge proof that convinces an efficient verifier that the output is both DP and reliable.
    Such a definition might seem unachievable, as a verifier must validate that DP randomness was generated faithfully without learning anything about the randomness itself. 
    We resolve this paradox by carefully mixing private and public randomness to compute verifiable DP counting queries with theoretical guarantees and show that it is also practical for real-world deployment. 
We also demonstrate that computational assumptions are necessary by showing a separation between information-theoretic DP and computational DP under our definition of verifiability.
  author = {Kuntai Cai and Xiaokui Xiao and Graham Cormode},
  title = {PrivLava: Synthesizing Relational Data with Foreign Keys under Differential Privacy},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  year = 2023,
  url = {../papers/privlava.pdf},
  link = {},
  abstract = {
    Answering database queries while preserving privacy is an important problem that has attracted considerable research attention in recent years. A canonical approach to this problem is to use {\it synthetic data}. That is, we replace the input database $\mathcal{R}$ with a synthetic database $\mathcal{R}^*$ that preserves the characteristics of $\mathcal{R}$, and use $\mathcal{R^*}$ to answer queries. Existing solutions for relational data synthesis, however, either fail to provide strong privacy protection, or assume that $\mathcal{R}$ contains a single relation. In addition, it is challenging to extend the existing single-relation solutions to the case of multiple relations, because they are unable to model the complex correlations induced by the foreign keys. Therefore, multi-relational data synthesis with strong privacy guarantees is an open problem.
    In this paper, we address the above open problem by proposing {\sf PrivLava}, the first solution for synthesizing relational data with foreign keys under {\it differential privacy}, a rigorous privacy framework widely adopted in both academia and industry. The key idea of {\sf PrivLava} is to model the data distribution in $\mathcal{R}$ using {\it graphical models}, with {\it latent variables} included to capture the inter-relational correlations caused by foreign keys. We show that {\sf PrivLava} supports arbitrary foreign key references that form a directed acyclic graph, and is able to tackle the common case when
$\mathcal{R}$ contains a mixture of public and private relations. Extensive experiments on census data sets and the TPC-H benchmark demonstrate that {\sf PrivLava} significantly outperforms its competitors in terms of the accuracy of aggregate queries processed on the synthetic data. 
  author = {Karthik Prasad and
               Sayan Ghosh and
               Graham Cormode and
               Ilya Mironov and
               Ashkan Yousefpour and
               Pierre Stock},
  title = {Reconciling Security and Communication Efficiency in Federated Learning},
  booktitle = {International Workshop on Federated Learning: Recent Advances and New Challenges in Conjunction with NeurIPS 2022},
  year = {2022},
  url = {},
  doi = {10.48550/arXiv.2207.12779},
  abstract = {Cross-device Federated Learning is an increasingly popular machine learning setting to train a model by leveraging a large population of client devices with high privacy and security guarantees. However, communication efficiency remains a major bottleneck when scaling federated learning to production environments, particularly due to bandwidth constraints during uplink communication. In this paper, we formalize and address the problem of compressing client-to-server model updates under the Secure Aggregation primitive, a core component of Federated Learning pipelines that allows the server to aggregate the client updates without accessing them individually. In particular, we adapt standard scalar quantization and pruning methods to Secure Aggregation and propose Secure Indexing, a variant of Secure Aggregation that supports quantization for extreme compression. We establish state-of-the-art results on LEAF benchmarks in a secure Federated Learning setup with up to 40× compression in uplink communication with no meaningful loss in utility compared to uncompressed baselines.}
  author = {Michael Shekelyan and Graham Cormode and Peter Triantafillou and Qingzhi Ma and Ali Mohammadi Shanghooshabad},
  title = {Streaming Weighted Sampling over Join Queries},
  booktitle = {EDBT},
  year = 2023,
  url = {../papers/streamingjoin.pdf},
  pages = {298-310},
  doi = {},
  abstract = {
  Join queries are a fundamental database tool, capturing a range of tasks that involve linking heterogeneous data sources. 
  However, with massive table sizes, it is often impractical to keep these in memory, and we can only take one or few streaming passes over them. 
  Moreover, building out the full join result (e.g., linking heterogeneous data sources along quasi-identifiers) can lead to a combinatorial explosion of results due to many-to-many links. 
  Random sampling is a natural tool to boil this oversized result down to a representative subset with well-understood statistical properties, but turns out to be a challenging task due to the combinatorial nature of the sampling domain. 
  Existing techniques in the literature focus solely on the setting with tabular data residing in main memory, and do not address aspects such as stream operation, weighted sampling and more general join operators that are urgently needed in a modern data processing context. 
  The main contribution of this work is to meet these needs with more lightweight practical approaches. 
  First, a bijection between the sampling problem and a graph problem is introduced to support weighted sampling and common join operators. 
  Second, the sampling techniques are refined to minimise the number of streaming passes. 
  Third, techniques are presented to deal with very large tables under limited memory. 
Finally, the proposed techniques are compared to existing approaches that rely on database indices and the results indicate substantial memory savings, reduced runtimes for ad-hoc queries and competitive amortised runtimes.}
  year = 2022,
  booktitle = {ACM Conference on Computer and Communications Security},
  author = {Samuel Maddock and Graham Cormode and Somesh Jha and Carsten Maple and Tianhao Wang},
  title = {Federated Boosted Decision Trees with Differential Privacy},
  link = {},
  url = {../papers/dpxgboost.pdf},
  abstract = {There is great demand for scalable, secure, and efficient privacy-preserving machine learning models that can be trained over distributed data. While deep learning models typically achieve the best results in a centralized non-secure setting, different models can excel when privacy and communication constraints are imposed. Instead, tree-based approaches such as XGBoost have attracted much attention for their high performance and ease of use; in particular, they often achieve state-of-the-art results on tabular data. Consequently, several recent works have focused on translating Gradient Boosted Decision Tree (GBDT) models like XGBoost into federated settings, via cryptographic mechanisms such as Homomorphic Encryption (HE) and Secure Multi-Party Computation (MPC).  
However, these do not always provide formal privacy guarantees, or consider the full range of hyperparameters and implementation settings. In this work, we implement the GBDT model under Differential Privacy (DP). We propose a general framework that captures and extends existing approaches for differentially private decision trees. Our framework of methods is tailored to the federated setting, and we show that with a careful choice of techniques it is possible to achieve very high utility while maintaining strong levels of privacy.}
  author = {Ashkan Yousefpour and Igor Shilov and Alex Sablayrolles and Davide Testuggine and Karthik Prasad and Mani Malek and John Nguyen and Sayan Ghosh and Akash Bharadwaj and Jessica Zhao and Graham Cormode and Ilya Mironov},
  title = {Opacus: User-Friendly Differential Privacy Library in PyTorch},
  booktitle = {Privacy in Machine Learning (NeurIPS workshop)},
  year = 2021,
  url = {../papers/opacus.pdf},
  abstract = {We introduce Opacus, a free, open-source PyTorch library for training deep learning models with differential privacy (hosted at \url{}). 
Opacus is designed for simplicity, flexibility, and speed. 
It provides a simple and user-friendly API, and enables machine learning practitioners to make a training pipeline private by adding as little as two lines to their code. 
It supports a wide variety of layers, including multi-head attention, convolution, LSTM, GRU (and generic RNN), and embedding, right out of the box and provides the means for supporting other user-defined layers. 
Opacus computes batched per-sample gradients, providing higher efficiency compared to the traditional ``micro batch'' approach. 
In this paper we present Opacus, detail the principles that drove its implementation and unique features, and benchmark it against other frameworks for training models with differential privacy as well as standard PyTorch. }
  author = {Lauren Watson and Chuan Guo and Graham Cormode and Alex Sablayrolles},
  title = {On the Importance of Difficulty Calibration in Membership Inference
  booktitle = {International Conference on Learning Representations, {ICLR}},
  year = {2022},
  url = {../papers/difficultycalibration.pdf},
  link = {},
  video = {},
  abstract = {The vulnerability of machine learning models to membership inference attacks has received much attention in recent years.
Existing attacks mostly remain impractical due to having high false positive rates, where non-member samples are often erroneously predicted as members. This type of error makes the predicted membership signal unreliable, especially since most samples are non-members in real world applications.
In this work, we argue that membership inference attacks can benefit drastically from \emph{difficulty calibration}, where an attack's predicted membership score is adjusted to the difficulty of correctly classifying the target sample. We show that difficulty calibration can significantly reduce the false positive rate of a variety of existing attacks without a loss in accuracy.}
  author = {Akash Bharadwaj and Graham Cormode},
  title = {Sample-and-threshold differential privacy: Histograms and applications},
  note = {(workshop version)},
  booktitle = {Privacy in Machine Learning (NeurIPS workshop)},
  year = 2021,
  url = {../papers/sntpriml.pdf},
  poster = {../slides/sntposter.pdf},
  video = {},
  abstract = {Federated analytics aims to compute accurate statistics from distributed datasets. 
A ``Differential Privacy'' (DP) guarantee is usually desired by the users of the devices storing the data. 
In this work, we prove a strong $(\epsilon, \delta)$-DP guarantee for a highly practical sampling-based procedure to derive histograms. We also provide accuracy guarantees and show how to apply the procedure to estimate quantiles and modes.}
  author = {Akash Bharadwaj and Graham Cormode},
  title = {Sample-and-threshold differential privacy: Histograms and applications},
  note = {(full version)},
  booktitle = {AISTATS},
  year = 2022,
  link = {},
  pubsite = {},
  url = {../papers/sntaistats.pdf},
  poster = {../slides/sntposter.pdf},
  video = {},
  abstract = {Federated analytics seeks to compute accurate statistics from data distributed across users' devices while providing a suitable privacy guarantee and being practically feasible to implement and scale. 
In this paper, we show how a strong $(\epsilon, \delta)$-Differential Privacy (DP) guarantee 
can be achieved for the fundamental problem of histogram generation in a federated setting, via a highly practical sampling-based procedure that does not add noise to disclosed data.
Given the ubiquity of sampling in practice, we thus obtain a DP guarantee almost for free, avoid over-estimating histogram counts, and allow easy reasoning about how privacy guarantees may obscure minorities and outliers.
Using such histograms, related problems such as heavy hitters and quantiles can be answered with provable error and privacy guarantees. 
Experimental results show that our sample-and-threshold approach is accurate and scalable.}
  author = {Huang, Ziyue and Qiu, Yuan and Yi, Ke and Cormode, Graham},
  title = {Frequency Estimation under Multiparty Differential Privacy: One-Shot and Streaming},
  year = {2022},
  publisher = {VLDB Endowment},
  volume = {15},
  url = {},
  doi = {10.14778/3547305.3547312},
  abstract = {We study the fundamental problem of frequency estimation under both privacy and communication constraints, where the data is distributed among $k$ parties. We consider two application scenarios: (1) one-shot, where the data is static and the aggregator conducts a one-time computation; and (2) streaming, where each party receives a stream of items over time and the aggregator continuously monitors the frequencies. We adopt the model of multiparty differential privacy (MDP), which is more general than local differential privacy (LDP) and (centralized) differential privacy. Our protocols achieve optimality (up to logarithmic factors) permissible by the more stringent of the two constraints. In particular, when specialized to the $\epsilon$-LDP model, our protocol achieves an error of $\sqrt{k}/(\exp(\Theta(\epsilon)) - 1)$ using $O(k max{\epsilon, log 1/\epsilon})$ bits of communication and $O(k \log u)$ bits of public randomness, where $u$ is the size of the domain.},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  pages = {2058–2070},
  numpages = {13},
  pubsite = {},
  url = {../papers/freqestmdp.pdf}
  title = {Bit-efficient Numerical Aggregation and Stronger Privacy for Trust in Federated Analytics},
  author = {Graham Cormode and Igor Markov},
  link = {},
  booktitle = {PPML Workshop},
  year = 2021,
  poster = {../slides/bitpushingposter21.pdf},
  url = {../papers/bitpushingpriml.pdf},
  abstract = {Private data generated by edge devices -- from smart phones to automotive electronics -- are highly informative when aggregated but can be damaging when mishandled. A variety of solutions are being explored but have not yet won the public's trust and full backing of mobile platforms. In this work, we propose numerical aggregation protocols that empirically improve upon prior art, while providing comparable local differential privacy guarantees. Sharing a single private bit per value supports privacy metering that enable privacy controls and guarantees that are not covered by differential privacy. We put emphasis on the ease of implementation, compatibility with existing methods, and compelling empirical performance.
  title = {Applying the Shuffle Model of Differential Privacy to Vector Aggregation},
  author = {Graham Cormode and Carsten Maple and Mary Scott},
  year = 2021,
  booktitle = {BICOD},
  pubsite = {},
  url = {../papers/bicod21.pdf},
  abstract = {
	In this work we introduce a new protocol for vector aggregation in the context of the Shuffle Model, a recent model within
Differential Privacy (DP). It sits between the Centralized Model, which prioritizes the level of accuracy over the secrecy of the
data, and the Local Model, for which an improvement in trust is counteracted by a much higher noise requirement. The
Shuffle Model was developed to provide a good balance between these two models through the addition of a shuffling step,
which unbinds the users from their data whilst maintaining a moderate noise requirement. We provide a single message
protocol for the summation of real vectors in the Shuffle Model, using advanced composition results. Our contribution
provides a mechanism to enable private aggregation and analysis across more sophisticated structures such as matrices and
higher-dimensional tensors, both of which are reliant on the functionality of the vector case.}
  title = {Real-World Trajectory Sharing with Local Differential Privacy},
  author = {Teddy Cunningham and Graham Cormode and Hakan Ferhatosmanoglu and Divesh Srivastava},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  year = 2021,
  url = {../papers/ldptrajectory.pdf},
  video = {},
  abstract = {Sharing trajectories is beneficial for many real-world applications,
such as managing disease spread through contact tracing and tailoring
public services to a population’s travel patterns. However,
public concern over privacy and data protection has limited the
extent to which this data is shared. Local differential privacy enables
data sharing in which users share a perturbed version of their
data, but existing mechanisms fail to incorporate user-independent
public knowledge (e.g., business locations and opening times, public
transport schedules, geo-located tweets). This limitation makes
mechanisms too restrictive, gives unrealistic outputs, and ultimately
leads to low practical utility. To address these concerns, we propose
a local differentially private mechanism that is based on perturbing
hierarchically-structured, overlapping $n$-grams (i.e., contiguous
subsequences of length $n$) of trajectory data. Our mechanism uses a
multi-dimensional hierarchy over publicly available external knowledge
of real-world places of interest to improve the realism and
utility of the perturbed, shared trajectories. Importantly, including
real-world public data does not negatively affect privacy or
efficiency. Our experiments, using real-world data and a range of
queries, each with real-world application analogues, demonstrate
the superiority of our approach over a range of competing methods.}
  title = {Frequency Estimation under Local Differential Privacy},
  author = {Graham Cormode and Sam Maddock and Carsten Maple},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  year = 2021,
  url = {../papers/ldphh.pdf},
  abstract = {Private collection of statistics from a large distributed population is an important problem, and has led to large scale deployments from several leading technology companies. 
The dominant approach requires each user to randomly perturb their input, leading to guarantees in the local differential privacy model. 
In this paper, we place the various approaches that have been suggested into a common framework, and perform an extensive series of experiments to understand the tradeoffs between different implementation choices. 
Our conclusion is that for the core problems of frequency estimation and heavy hitter identification, careful choice of algorithms can lead to very effective solutions that scale to millions of users.}
  title = {Privacy-Preserving Synthetic Location Data in the Real World},
  author = {Teddy Cunningham and Graham Cormode and Hakan Ferhatosmanoglu},
  booktitle = {Proceedings of International Symposium on Spatial and Temporal Databases},
  year = 2021,
  url = {../papers/psldrw.pdf},
  abstract = {Sharing sensitive data is vital in enabling many modern data analysis and machine learning tasks. 
current methods for data release are insufficiently accurate or granular to provide meaningful utility, and they carry a high risk of deanonymization or membership inference attacks.
In this paper, we propose a differentially private synthetic data generation solution with a focus on the compelling domain of location data.
We present two methods with high practical utility for generating synthetic location data from real locations, both of which protect the existence and true location of each individual in the original dataset. 
Our first, partitioning-based approach introduces a novel method for privately generating point data using kernel density estimation, in addition to employing private adaptations of classic statistical techniques, such as clustering, for private partitioning.
Our second, network-based approach incorporates public geographic information, such as the road network of a city, to constrain the bounds of synthetic data points and hence improve the accuracy of the synthetic data.
Both methods satisfy the requirements of differential privacy, while also enabling accurate generation of synthetic data that aims to preserve the distribution of the real locations.
We conduct experiments using three large-scale location datasets to show that the proposed solutions generate synthetic location data with high utility and strong similarity to the real datasets. 
We highlight some practical applications for our work by applying our synthetic data to a range of location analytics queries, and we demonstrate that our synthetic data produces near-identical answers to the same queries compared to when real data is used. 
Our results show that the proposed approaches are practical solutions for sharing and analyzing sensitive location data privately.}
  year = 2021,
  author = {Graham Cormode and Abhinav Mishra and Joseph Ross and Pavel Vesel{\'{y}}},
  booktitle = {{ACM} {SIGKDD} Conference},
  url = {../papers/theorypracticemedian.pdf},
  title = {Theory meets Practice at the Median:a worst case comparison of relative error quantile algorithms},
  link = {},
  abstract = {Estimating the distribution and quantiles of data is a foundational
task in data mining and data science. We study algorithms which
provide accurate results for extreme quantile queries using a small
amount of space, thus helping to understand the tails of the input
distribution. Namely, we focus on two recent state-of-the-art
solutions: t-digest and ReqSketch. While t-digest is a popular compact
summary which works well in a variety of settings, ReqSketch
comes with formal accuracy guarantees at the cost of its
size growing as new observations are inserted. In this work, we
provide insight into which conditions make one preferable to the
other. Namely, we show how to construct inputs for t-digest that
induce an almost arbitrarily large error and demonstrate that it
fails to provide accurate results even on i.i.d. samples from a highly
non-uniform distribution. We propose practical improvements to
ReqSketch, making it faster than t-digest, while its error stays
bounded on any instance. Still, our results confirm that t-digest
remains more accurate on the “non-adversarial” data encountered
in practice.}
  title = {Sequential Random Sampling Revisited: Hidden Shuffle Method},
  booktitle = {AISTATS},
  author = {Michael Shekelyan and Graham Cormode},
  url = {../papers/hiddenshuffle.pdf},
  volume = {130},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  link = {},
  pages = {3628-3636},
  year = 2021,
  abstract = {
  Random sampling (without replacement) is ubiquitously employed to obtain a representative subset of the data.
  Unlike common methods, sequential methods report samples in ascending order of index without keeping track of previous samples. This enables lightweight iterators that can jump directly from one sampled position to the next. Previously, sequential methods focused on drawing from the distribution of gap sizes, which requires intricate algorithms that are difficult to validate and can be slow in the worst-case. This can be avoided by a new method, the Hidden Shuffle. The name mirrors the fact that although the algorithm does not resemble shuffling, its correctness can be proven by conceptualising the sampling process as a random shuffle. The Hidden Shuffle algorithm stores just a handful of values, can be implemented in few lines of code, offers strong worst-case guarantees and is shown to be faster than state-of-the-art methods while using comparably few random variates.}
  author = {Graham Cormode and Zohar Karnin and Edo Liberty and Justin Thaler and Pavel Vesel{\'{y}}},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  year = 2021,
  title = {Relative Error Streaming Quantiles},
  url = {../papers/reqsketch.pdf},
  link = {},
  abstract = {
  	Approximating ranks, quantiles, and distributions 
  	over streaming data is a central task 
  	in data analysis and monitoring.
  	Given a stream of $n$ items from a data universe $\mathcal{U}$
  	equipped with a total order, the task is to compute a sketch (data structure) of size {poly}$(\log(n), 1/\epsilon)$.
  	Given the sketch and a query item $y \in \mathcal{U}$, one should be able to approximate its rank in the stream, i.e., the number of stream elements smaller than $y$.
  	Most works to date focused on additive $\epsilon n$ error approximation, culminating in the KLL sketch that achieved optimal asymptotic behavior. 
  	This paper investigates \emph{multiplicative} $(1 +/- \epsilon)$-error approximations to the rank. 
  	Practical motivation for multiplicative error stems from demands to understand the tails of distributions,
  	and hence for sketches to be more accurate near extreme values. 
  	The most space-efficient algorithms 
  	due to prior work store either $O(\log(\epsilon^2 n)/\epsilon^2)$ or
  	$O(\log^3(\epsilon n)/\epsilon)$ universe items. 
  	This paper presents a randomized algorithm storing $O(\log^{1.5}(\epsilon n)/\epsilon)$ items.
  	This is within an $O(\sqrt{\log(\epsilon n)})$ factor of optimal. The algorithm does not require 
  	 prior knowledge of the stream length and is fully mergeable, rendering it suitable for parallel and distributed computing environments.}
  author = {Graham Cormode and Charlie Dickens and David Woodruff},
  title = {Subspace exploration: Bounds on Projected Frequency Estimation},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  url = {../papers/subspaceexploration.pdf},
  link = {},
  year = 2021,
  abstract = {
    Given an $n \times d$ dimensional dataset $A$, a projection query
    specifies a subset $C \subseteq [d]$ of columns which yields a new
    $n \times |C|$ array.
    We study the space complexity of computing data analysis
    functions over such subspaces, including heavy hitters and norms, when the
    subspaces are revealed only after observing the data.
    We show that this important class of problems is typically hard:
    for many problems, we show $2^{\Omega(d)}$ lower bounds.
    However, we present upper bounds
    which demonstrate space dependency better than $2^d$.
    That is, for $c,c' \in (0,1)$ and a parameter $N=2^d$, an $N^c$-approximation can be obtained
    in space $\min(N^{c'},n)$, showing that it is possible to improve on the na{\"\i}ve approach
    of keeping information for all $2^d$ subsets of $d$ columns. 
    Our results are based on careful constructions of instances using coding
    theory and novel combinatorial reductions
    that exhibit such space-approximation tradeoffs.}
  author = {Graham Cormode and Minos Garofalakis and Michael Shekelyan},
  title = {Data-Independent Space Partitionings for Summaries},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  url = {../papers/dataindependenthist.pdf},
  year = 2021,
  abstract = {
    Histograms are a standard tool in data management for describing
    multidimensional data.
    It is often convenient or even necessary to define {\em data independent histograms}, to
    partition space in advance without observing the data itself.
    Specific motivations arise in managing data when it is not suitable to
    frequently change the boundaries between histogram cells.
    For example, when the data is subject to many
    insertions and deletions; when data is distributed across multiple
    systems; or when producing a privacy-preserving representation of the
    data.
    The baseline approach is to consider an equiwidth histogram, 
    i.e., a regular grid over the space.
    However, this is not optimal for the objective of splitting
    the multidimensional space into (possibly overlapping) bins, such that
    each box can be rebuilt using a set of non-overlapping bins with
    minimal excess (or deficit) of volume.
    Thus, we investigate how to split the space
    into bins and identify novel solutions that offer a good balance of desirable properties. 
    As many data processing tools require a dataset as an input, we propose efficient methods
    for obtaining synthetic point sets that match the histograms over the overlapping bins.}
  author = {Graham Cormode and
               Pavel Vesel{\'{y}}},
  title = {A Tight Lower Bound for Comparison-Based Quantile Summaries},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  pages = {81--93},
  publisher = {{ACM}},
  year = {2020},
  url = {../papers/quantiles-pods.pdf},
  link = {},
  doi = {10.1145/3375395.3387650}
  title = {Iterative Hessian Sketch in Input Sparsity Time},
  author = {Graham Cormode and Charlie Dickens},
  booktitle = {Proceedings of Beyond First Order Methods in ML (NeurIPS workshop)},
  year = {2019},
  url = {../papers/neurips_2019.pdf},
  poster = {../papers/ihs-poster.pdf},
  link = {},
  abstract = {
  Scalable algorithms to solve optimization and regression tasks, even
  approximately, are needed
  to work with large datasets.
  In this paper we study efficient techniques from matrix sketching to
  solve a variety of convex constrained regression problems.
  We adopt ``Iterative Hessian Sketching'' (IHS) and show that the fast
  CountSketch and sparse Johnson-Lindenstrauss Transforms yield
  state-of-the-art accuracy guarantees under IHS, while dramatically
  improving the time cost.
  As a result, we obtain significantly faster algorithms for constrained
  regression, for both sparse and dense inputs.
  Our empirical results show that we can summarize data roughly 100x
  faster for sparse data, and, surprisingly, 10x faster on
  dense data!
Consequently, solutions accurate to within machine precision of the optimal solution can be found much faster than the previous state of the art.}
  title = {Efficient Interactive Proofs for Linear Algebra},
  author = {Graham Cormode and Chris Hickey},
  booktitle = {Proceedings of International Symposium on Algorithms and Computation {(ISAAC)}},
  year = {2019},
  slides = {../slides/EIPLAslides.pdf},
  url = {../papers/EIPLA.pdf},
  abstract = {Motivated by the growth in outsourced data analysis, we describe
methods for verifying basic linear algebra operations performed by a
cloud service without having to recalculate the entire result.
We provide novel protocols in the streaming setting for inner product,
matrix multiplication and vector-matrix-vector multiplication where
the number of rounds of interaction can be adjusted to tradeoff space,
communication, and duration of the protocol. 
Previous work suggests that the costs of these interactive protocols are
optimized by choosing $O(\log n)$ rounds.
However, we argue that we can reduce the number of rounds without incurring a significant time penalty by considering the total end-to-end time, so fewer rounds and larger messages are preferable. 
We confirm this claim with an experimental study that shows that a
constant number of rounds gives the fastest protocol.}
  year = 2019,
  title = {Streaming Algorithms for Bin Packing and Vector Scheduling},
  booktitle = {Workshop on Approximation and Online Algorithms},
  author = {Graham Cormode and Pavel Vesel\'{y}},
  url = {../papers/bin_packing-waoa.pdf},
  abstract = {Problems involving the efficient arrangement of simple objects, as captured by bin
packing and makespan scheduling,
are fundamental tasks in combinatorial optimization.
These are well understood in the traditional online and offline cases,
but have been less well-studied when the volume of the input is truly
massive, and cannot even be read into memory.
This is captured by the streaming model of computation, where the aim
is to approximate the cost of the solution in one pass over the data,
using small space. As a result, streaming algorithms produce
concise input summaries that approximately preserve the optimum value.

We design the first efficient streaming algorithms
for these fundamental problems in combinatorial optimization.
For \textsc{Bin Packing}, we provide a streaming asymptotic $1+\epsilon$-approximation
with $O\sim(1/\epsilon)$ memory,
where $O\sim$ hides logarithmic factors.
Moreover, such a space bound is essentially optimal. 
Our algorithm implies a streaming $d+\epsilon$-approximation for \textsc{Vector Bin Packing}
in $d$ dimensions, running in space $O\sim(\frac{d}{\epsilon})$.
For the related \textsc{Vector Scheduling} problem, we show how to construct
an input summary in space $O\sim(d^2\cdot m / \epsilon^2)$ that preserves
the optimum value up to a factor of $2 - \frac{1}{m} +\epsilon$,
where $m$ is the number of identical machines.}
  year = 2019,
  title = {Towards a Theory of Parameterized Streaming Algorithms},
  author = {Rajesh Chitnis and Graham Cormode},
  booktitle = {International Symposium on Parameterized and Exact Computation},
  url = {../papers/paramstream19.pdf},
  slides = {../slides/ipec-param-streaming-slides.pdf},
  abstract = {
  Parameterized complexity attempts to give a more fine-grained analysis of the complexity of problems: instead of measuring the running time as a function of only the input size, we analyze the running time with respect to additional parameters.
  This approach has proven to be highly successful in
  delineating our understanding of NP-hard problems.
  Given this success with the TIME resource, it seems but natural to use
  this approach for dealing with the SPACE resource.
  First attempts in this direction have considered a few individual problems,
  with some success:
  Fafianie and Kratsch [MFCS'14] and Chitnis et al. [SODA'15]
  introduced the notions of streaming kernels and parameterized streaming algorithms respectively.
  For example, the latter shows how to refine the $\Omega(n^2)$ bit lower bound for finding a minimum Vertex Cover (VC) in the streaming setting by designing an algorithm for the parameterized $k$-VC problem which uses $O(k^{2}\log n)$ bits.
  In this paper, we initiate a systematic study of graph problems from
  the paradigm of parameterized streaming algorithms.
  We first define a natural hierarchy of space complexity classes
  of FPS, SubPS, SemiPS, SupPS and BPS, and then obtain tight
  classifications for several well-studied graph problems such as
  Longest Path, Feedback Vertex Set, Dominating Set, Girth, Treewidth, etc. into this hierarchy.
  On the algorithmic side, our parameterized streaming algorithms use
  techniques from the FPT world such as bidimensionality, iterative compression and
  bounded-depth search trees. On the hardness side, we obtain lower bounds for the parameterized streaming complexity of various problems via novel reductions from problems in communication complexity.
  We also show a general (unconditional) lower bound for space complexity of parameterized streaming algorithms for a large class of problems inspired by the recently developed frameworks for showing (conditional) kernelization lower bounds.
  Parameterized algorithms and streaming algorithms are approaches to
  cope with TIME and SPACE intractability respectively. It is our hope
  that this work on parameterized streaming algorithms leads to two-way
  flow of ideas between these two previously separated areas of theoretical computer science.}
  title = {Answering Range Queries Under Local Differential Privacy},
  author = {Graham Cormode and Tejas Kulkarni and Divesh Srivastava},
  year = 2019,
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  url = {../papers/ldprange.pdf},
  abstract = {Counting the fraction of a population having an input within a
specified interval i.e. a \emph{range query}, is a fundamental data
analysis primitive.
Range queries can also be used to compute other core statistics
such as \emph{quantiles}, and to build prediction models.
However, frequently the data is subject to privacy concerns when it
is drawn from individuals, and relates for example to their financial, health,
religious or political status. 
In this paper, we introduce and analyze methods to support range
queries under the local variant of differential
privacy, an emerging standard for privacy-preserving
data analysis.

The local model requires that each user releases a noisy view of her
private data under a privacy guarantee. 
While many works address the problem of range queries in the
trusted aggregator setting, this problem has not been
addressed specifically under untrusted aggregation (local DP) model
even though many primitives have been developed recently for 
estimating a discrete distribution.
We describe and analyze two classes of approaches for range queries,
based on hierarchical histograms and the Haar wavelet transform.
    We show that both have strong theoretical accuracy guarantees.
    In practice, both methods are fast and require minimal computation and
communication resources.
Our experiments show that the wavelet approach is most accurate in high privacy settings, while
the hierarchical approach dominates for weaker privacy requirements. }
  title = {Independent Sets in Vertex-Arrival Streams},
  author = {Graham Cormode and Jacques Dark and Christian Konrad},
  year = 2019,
  booktitle = {International Colloquium on Automata, 
                  Languages and Programming ({ICALP})},
  url = {../papers/vertexarrival.pdf},
  abstract = {    We consider the maximal and maximum independent set problems in
    three models of graph streams:

    In the edge model we see a stream of edges which
    collectively define a graph; this model is well-studied for
    a variety of problems. We show that the space complexity for
    a one-pass streaming algorithm to find a maximal independent set
    is quadratic (i.e. we must store all edges). We further
    show that it is not much easier if we only
    require approximate maximality. This contrasts strongly with the
    other two vertex-based models, where one can greedily find an exact solution in only the space needed to store the independent set.

    In the ``explicit'' vertex model, the input stream is a
    sequence of vertices making up the graph. Every vertex
    arrives along with its incident edges that connect to previously
    arrived vertices. Various graph problems require substantially
    less space to solve in this setting than in edge-arrival streams. We
    show that every one-pass c-approximation streaming algorithm for
    maximum independent set (\textsc{MIS}) on explicit vertex streams requires $\Omega(\frac{n^2}{c^6})$ bits of space, 
    where $n$ is the number of vertices of the input graph. It is
    already known that ${\Theta}\sim(\frac{n^2}{c^2})$ bits of space 
    are necessary and sufficient in the edge arrival model (Halld\'{o}rsson \emph{et al.} 2012), thus the \textsc{MIS} problem is not significantly 
    easier to solve under the explicit vertex arrival order assumption. Our result is proved via a reduction from a new 
    multi-party communication problem closely related to pointer jumping.

    In the ``implicit'' vertex model, the input stream consists of a sequence of objects, one per vertex. 
    The algorithm is equipped with a function that maps pairs of objects to the presence or absence of edges, 
    thus defining the graph. This model captures, for example, geometric intersection graphs such as unit disc graphs. 
    Our final set of results consists of several improved upper and lower bounds for interval and square intersection graphs, in both 
    explicit and implicit streams. In particular, we show a gap
    between the hardness of the explicit and implicit vertex models
    for interval graphs.}
  author = {Graham Cormode and Charlie Dickens and David P. Woodruff},
  title = {Leveraging Well-Conditioned Bases: Streaming and Distributed Summaries in Minkowski $p$-Norms},
  booktitle = {International Conference on Machine Learning, ({ICML})},
  year = 2018,
  url = {../papers/wcb.pdf},
  poster = {../slides/leveragingwcb-poster.pdf},
  abstract = {
    Work on approximate linear algebra
    has led to efficient distributed and streaming
    algorithms for
    problems such as approximate matrix multiplication, low rank approximation,
    and regression, primarily for the Euclidean norm $l_2$.
    We study other
    $l_p$ norms, which are more robust for $p < 2$, and can be used
    to find outliers for $p > 2$.
    Unlike previous algorithms for such norms,
  we give algorithms that are (1) deterministic, (2) work simultaneously
  for every $p \geq 1$, including $p = \infty$, and (3) can be
    implemented in both
    distributed and streaming environments. We apply our results to $l_p$-regression,
    entrywise $l_1$-low rank approximation,
    and approximate matrix multiplication.}
  author = {Graham Cormode and Tejas Kulkarni and Divesh Srivastava},
  title = {Marginal Release Under Local Differential Privacy},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  url = {../papers/sigmod18.pdf},
  slides = {../slides/marginals-sigmod.pdf},
  poster = {../slides/marginals-sigmod-poster.pdf},
  year = 2018,
  abstract = {Many analysis and machine learning tasks require the availability of
marginal statistics on multidimensional datasets while providing
strong privacy guarantees for the data subjects.
Applications for these statistics range from finding correlations
in the data to fitting sophisticated prediction models.
In this paper, we provide a set of algorithms for materializing marginal
statistics under the strong model of local differential privacy.
We prove the first tight theoretical bounds on the accuracy of marginals
compiled under each approach, perform empirical evaluation to
confirm these bounds, and evaluate them for tasks such as modeling and
correlation testing.
Our results show that releasing information based on (local) Fourier
transformations of the input is preferable to alternatives based
directly on (local) marginals.}
  author = {Graham Cormode and Christopher Hickey},
  title = {You Can Check Others' Work More Quickly Than Doing It Yourself},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2018,
  url = {../papers/lightning.pdf},
  slides = {../slides/sip2018icde.pdf},
  abstract = {Much of computer science involves problems where
it is considered to be easier to check that an answer is correct than
to find a correct answer (the complexity class NP). In this talk,
we outline results that apply this notion to checking outsourced
computations for data analytics.}
  author = {Graham Cormode and Jacques Dark and Christian Konrad},
  title = {Approximating the Caro-Wei bound for Independent Sets in Graph Streams},
  url = {../papers/cdk17.pdf},
  year = 2018,
  booktitle = {International Symposium on Combinatorial Optimization},
  abstract = {
  The Caro-Wei bound states that every graph $G=(V, E)$ contains an independent set of size at 
  least $\beta(G) := \sum_{v \in V} \frac{1}{{deg}_G(v) + 1}$, where ${deg}_G(v)$ denotes the degree of vertex $v$.
  Halld\'{o}rsson et al. gave a randomized one-pass streaming algorithm that computes an independent 
  set of expected size $\beta(G)$ using $O(n \log n)$ space. In this paper, we give streaming algorithms and a 
  lower bound for approximating the Caro-Wei bound itself.
  In the edge arrival model, we present a one-pass $c$-approximation
  streaming algorithm that uses $O({{d'} \log (n) /c^2})$ space, 
  where ${d'}$ is the average degree of $G$. We further prove
  that space $\Omega({{d'} / c^2})$ is necessary, rendering
  our algorithm almost optimal. This lower bound holds even 
  in the {\em vertex arrival model}, where vertices arrive one by one together with their incident edges that connect to vertices
  that have previously arrived. 
  In order to obtain a poly-logarithmic space algorithm even for graphs with arbitrarily large average degree, we employ
  an alternative notion of approximation: We give a one-pass streaming algorithm with space $O(\log^3 n)$ in the vertex arrival model 
  that outputs a value that is at most a logarithmic factor below the true value of $\beta$ and no more than the maximum independent set size.}
  author = {Yu Zhang and Srikanta Tirthapura and Graham Cormode},
  title = {Learning Graphical Models from a Distributed Stream},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2018,
  url = {../papers/distributedBayes.pdf},
  poster = {../slides/distbayesposter.pdf},
  abstract = {
 A current challenge for data management systems is to support the
 construction and maintenance of machine learning models over data that
 is large, multi-dimensional, and evolving.
 While systems that could support these tasks are emerging, the need to scale
 to distributed, streaming data requires new models and algorithms.
 In this setting, as well as computational scalability and model
 accuracy, we also need to minimize the amount of communication between
 distributed processors, which is the chief component of latency.
 We study Bayesian networks, the workhorse of graphical models, 
 and present a communication-efficient method for continuously learning 
 and maintaining a Bayesian network model over data that is 
 arriving as a distributed stream partitioned across multiple processors. 
 We show a strategy for maintaining model parameters 
 that leads to an exponential reduction in communication when 
 compared with baseline approaches to maintain the exact MLE (maximum
 likelihood estimation).
 Meanwhile, our strategy provides similar prediction errors for the
 target distribution and for classification tasks.}
  author = {Graham Cormode and Jacques Dark},
  title = {Fast Sketch-based Recovery of Correlation Outliers},
  booktitle = {International Conference on Database Theory},
  year = 2018,
  url = {../papers/correlationoutliers.pdf},
  abstract = {
    Many data sources can be interpreted as time-series, and a key problem is to identify which pairs out of a large collection of signals are highly correlated. We expect that there will be few, large, interesting correlations, while most signal pairs do not have any strong correlation. We abstract this as the problem of identifying the highly correlated pairs in a collection of $n$ mostly pairwise uncorrelated random variables, where observations of the variables arrives as a stream. Dimensionality reduction can remove dependence on the number of observations, but further techniques are required to tame the quadratic (in $n$) cost of a search through all possible pairs. 
    We develop a new algorithm for rapidly finding large correlations based on sketch techniques with an added twist: we quickly generate sketches of random combinations of signals, and use these in concert with ideas from coding theory to decode the identity of correlated pairs. We prove correctness and compare performance and effectiveness with the best LSH (locality sensitive hashing) based approach.}
  author = {Graham Cormode and Chris Hickey},
  title = {Cheap Checking for Cloud Computing: 
            Statistical Analysis via Annotated Data Streams},
  booktitle = {AISTATS},
  year = 2018,
  poster = {../slides/cheapcheckingposter.pdf},
  url = {../papers/cheapchecking.pdf},
  abstract = {As the popularity of outsourced computation increases, questions of 
accuracy and trust between the client and the cloud computing services
become ever more relevant.
Our work aims to provide fast and practical methods to verify analysis
of large data sets, where the client's computation and memory
costs are kept to a minimum.
Our verification protocols are based on defining ``proofs'' which are easy
to create and check.
These add only a small overhead to reporting the result of the
computation itself. 
We build up a series of protocols for elementary statistical methods, to create more complex protocols for Ordinary Least Squares, Principal Component Analysis and Linear Discriminant Analysis. 
We show that these are very efficient in practice. }
  author = {Graham Cormode and Tejas Kulkarni and Divesh Srivastava},
  title = {Constrained Private Mechanisms for Count Data},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2018,
  url = {../papers/constrainedmechanisms.pdf},
  slides = {../slides/icde18constrained.pdf},
  abstract = {
    Concern about how to aggregate sensitive user data without compromising
    individual privacy is a major barrier to greater availability of
    data.
    The model of differential privacy has emerged as an accepted model to
    release sensitive information while giving a statistical guarantee for
    privacy.
    Many different algorithms are possible to address different target
    functions.
    We focus on the core problem of count queries, and seek to design
    {\em mechanisms} to release data associated with a group of $n$
    individuals.
    Prior work has focused on designing mechanisms by raw optimization of a
    loss function, without regard to the consequences on the results. 
    This can lead to mechanisms with undesirable properties, such as
    never reporting some outputs (gaps), and overreporting others
    (run-together outputs).
    We tame these pathological behaviors by introducing a set of desirable properties that
    mechanisms can obey.
    Any combination of these can be satisfied by solving a linear program (LP)
    which minimizes a cost function, with constraints enforcing the properties.
    We focus on a particular cost function, and 
    provide explicit constructions that are optimal for certain
    combinations of properties, and show a closed form for their cost.
    In the end, there are only a handful of distinct optimal mechanisms to choose
    between: one is the well-known (truncated) geometric mechanism; the
    second a novel mechanism that we introduce here, and the remainder are
    found as the solution to particular LPs.
    These all avoid the bad behaviors we identify. 
    We demonstrate in a set of experiments on real and
    synthetic data which is preferable in practice, for different
combinations of data distributions, constraints, and privacy parameters. }
  author = {Graham Cormode and Hossein Jowhari and Morteza Monemizadeh and S. Muthukrishnan},
  title = {Streaming Algorithms for Matching Size Estimation in Sparse Graphs},
  booktitle = {European Symposium on Algorithms},
  year = 2017,
  url = {../papers/matching-esa.pdf},
  slides = {../slides/sparse-esa.pdf},
  pubsite = {},
  abstract = {Estimating the size of the maximum matching is a canonical problem in
graph analysis, and one that has attracted extensive study over a
range of different computational models. 
We present improved streaming algorithms for approximating the size of maximum matching with sparse (bounded arboricity) graphs. 

* {\em (Insert-Only Streams)}
We present a one-pass algorithm that takes  $O(c\log n)$ space and approximates the size of the  maximum matching in 
graphs with arboricity $c$ within a factor of $O(c)$. This improves
significantly upon the state-of-the-art ${O}\sim(c n^{2/3})$-space
streaming algorithms, and is the first poly-logarithmic space
algorithm for this problem. 

* {\em (Dynamic Streams)}
Given a dynamic graph stream (i.e., inserts and deletes) of edges of an underlying $c$-bounded arboricity graph,  
we present a one-pass algorithm that uses space ${O}\sim(c^{10/3}n^{2/3})$ 
and returns an $O(c)$-estimator for the size of the maximum matching on the condition that the number
of edge deletions in the stream is bounded by $O(c n)$. 
For this class of inputs, our algorithm improves the state-of-the-art ${O}\sim(c n^{4/5})$-space algorithms, where the ${O}\sim(.)$ notation hides logarithmic in $n$ dependencies.

In contrast to prior work, our results take more advantage of the streaming access to the input and characterize the matching size 
based on the ordering of the edges in the stream in addition to the degree
distributions and structural properties of the sparse graphs.}
  author = {Zach Jorgensen and
               Ting Yu and
               Graham Cormode},
  title = {Publishing Attributed Social Graphs with Formal Privacy Guarantees},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  pages = {107--122},
  year = {2016},
  url = {../papers/attributedgraph.pdf},
  slides = {../slides/graphanon-sigmod.pdf},
  poster = {../slides/graphanon16-poster.pdf},
  link = {},
  abstract = {Many data analysis tasks rely on the abstraction of a graph to represent
relations between entities, with attributes on the nodes and edges.  
Since the relationships encoded are often sensitive, we seek effective
ways to release representative graphs which nevertheless protect the
privacy of the data subjects.
Prior work on this topic has focused primarily on the graph
structure in isolation, and has not provided ways to handle richer
graphs with correlated attributes.

We introduce an approach to release such graphs under the strong guarantee
of differential privacy.
We adapt existing graph models, and introduce a new one, and show how
to augment them with meaningful privacy.
This provides a complete workflow, where the input is a sensitive
graph, and the output is a realistic synthetic graph.
Our experimental study demonstrates that our process produces useful,
accurate attributed graphs. }
  title = {Kernelization via Sampling with Applications to Dynamic Graph Streams},
  author = {Rajesh Chitnis and Graham Cormode and Hossein Esfandiari and MohammadTaghi Hajiaghayi and Andrew McGregor and Morteza Monemizadeh and Sofya Vorotnikova},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  year = 2016,
  url = {../papers/vcsampling.pdf},
  link = {},
  abstract = {In this paper we present a simple but powerful subgraph sampling primitive that is applicable in a variety of computational models including dynamic graph streams (where the input graph is defined by a sequence of edge/hyperedge insertions and deletions) and distributed systems such as MapReduce. In the case of dynamic graph streams, we use this primitive to prove the following results:

* {\em Matching:} Our main result for matchings is that there exists an ${O}\sim(k^2)$ space algorithm that returns the edges of a maximum matching on the assumption the cardinality is at most $k$. The best previous algorithm used ${O}\sim(kn)$ space where $n$ is the number of vertices in the graph
and we prove our result is optimal up to logarithmic factors. Our algorithm has ${O}\sim(1)$ update time. 
We also show that there exists an ${O}\sim(n^2/\alpha^3)$ space
algorithm that returns an $\alpha$-approximation for matchings of
arbitrary size. In independent work, Assadi et al.~(arXiv 2015) proved
this is optimal and provided an alternative algorithm. We  generalize our  exact and approximate algorithms to weighted matching. 
While there has been a substantial amount of work on approximate matching in insert-only graph streams, these are the first non-trivial results  in the dynamic setting. 

* {\em Vertex Cover and Hitting Set:} There exists an ${O}\sim(k^d)$ space algorithm that solves the minimum hitting set problem where $d$ is the cardinality of the input sets and $k$ is an upper bound on the size of the minimum hitting set. We prove this is  optimal up to logarithmic factors. Our algorithm has ${O}\sim(1)$ update time. The case $d=2$ corresponds to minimum vertex cover.

Finally, we consider a larger family of parameterized problems (including $b$-matching, disjoint paths, vertex coloring among others) for which our subgraph sampling primitive yields fast, small-space dynamic graph stream algorithms. We then show lower bounds for natural problems outside this family.}
  author = {Kook Jin Ahn and
               Graham Cormode and
               Sudipto Guha and
               Andrew McGregor and
               Anthony Wirth},
  title = {Correlation Clustering in Data Streams},
  booktitle = {International Conference on Machine Learning, ({ICML})},
  pages = {2237--2246},
  year = {2015},
  pubsite = {},
  url = {../papers/corrclustering.pdf},
  abstract = {
  In this paper, we address the
  problem of \emph{correlation clustering} in the dynamic data stream
  model.
  The stream consists of updates to the edge weights of a graph
  on~$n$ nodes and the goal is to find a node-partition such that the
  end-points of negative-weight edges are typically in different
  clusters whereas the end-points of positive-weight edges are typically
  in the same cluster. We present polynomial-time, $O(n\cdot {polylog}
  n)$-space approximation algorithms for natural problems that arise.
  We first develop data structures based on linear sketches that allow
  the ``quality'' of a given node-partition to be measured. We then 
  combine these data structures with convex programming and
  sampling techniques to solve the relevant approximation problem.
  However the standard LP and SDP formulations are not obviously
  solvable in $O(n\cdot {polylog} n)$-space. Our work presents
  space-efficient algorithms for the convex programming required, as well as
  approaches to reduce the adaptivity of the sampling. Note that the improved space
  and running-time bounds achieved from streaming algorithms
  are also useful for offline settings such as MapReduce models.}
  author = {Xi He and
               Graham Cormode and
               Ashwin Machanavajjhala and
               Cecilia M. Procopiuc and
               Divesh Srivastava},
  title = {{DPT:} Differentially Private Trajectory Synthesis Using Hierarchical
               Reference Systems},
  booktitle = {Proceedings of the {VLDB} Endowment},
  volume = {8},
  pages = {1154--1165},
  year = {2015},
  url = {../papers/dpt.pdf},
  pubsite = {},
  abstract = {GPS-enabled devices are now ubiquitous, from airplanes and cars to 
smartphones and wearable technology.
This has resulted in a wealth of data about the movements of 
individuals and populations, which can be analyzed for useful information 
to aid in city and traffic planning, disaster preparedness and so on. 
However, the places that people go can disclose extremely sensitive 
information about them, and thus their use needs to be filtered 
through privacy preserving mechanisms. 
This turns out to be a highly challenging task: raw trajectories are 
highly detailed, and typically no pair is alike. 
Previous attempts fail either to provide adequate privacy protection, 
or to remain sufficiently faithful to the original behavior.

This paper presents DPT, a system to synthesize mobility data based 
on raw GPS trajectories of individuals while ensuring strong privacy 
protection in the form of $\epsilon$-differential privacy.
DPT makes a number of novel modeling and algorithmic contributions
including (i) discretization of raw trajectories using hierarchical 
reference systems (at multiple resolutions) to capture individual 
movements at differing speeds, (ii) adaptive mechanisms to select a 
small set of reference systems and construct prefix tree counts
privately, and (iii) use of direction-weighted sampling for improved utility. 
While there have been prior attempts to solve the subproblems required
to generate synthetic trajectories, to the best of our knowledge, 
ours is the first system that provides an end-to-end solution. 
We show the efficacy of our synthetic trajectory generation system 
using an extensive empirical evaluation.}
  author = {Rajesh Chitnis and
               Graham Cormode and
               Hossein Esfandiari and
               MohammadTaghi Hajiaghayi and
               Morteza Monemizadeh},
  title = {New Streaming Algorithms for Parameterized Maximal
               Matching {\&} Beyond},
  booktitle = {Symposium on Parallelism in Algorithms},
  pages = {56--58},
  year = {2015},
  url = {../papers/maxmatching.pdf},
  abstract = {
   Very recently at SODA'15, we studied maximal matching via the framework of
   \emph{parameterized streaming}, where we sought solutions
   under the promise that no maximal matching exceeds $k$ in size.
   In this paper, we revisit this problem and provide a much simpler algorithm for this problem.
   We are also able to apply the same technique to the \emph{Point Line Cover} problem.}
  author = {Jun Zhang and Graham Cormode and Magda Procopiuc and Divesh Srivastava and Xiaokui Xiao},
  title = {Private Release of Graph Statistics using Ladder Functions},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  year = 2015,
  url = {../papers/staircase.pdf},
  abstract = {
  Protecting the privacy of individuals in graph structured data while
  making accurate versions of the data available is one of the most
  challenging problems in data privacy.
  Most efforts to date to perform this data
  release end up mired in complexity, overwhelm the signal with noise, and
  are not effective for use in practice.
  In this paper, we introduce a new method which guarantees differential
  privacy.  It specifies a probability distribution over possible outputs
  that is carefully defined to maximize the utility for the given input,
  while still providing the required privacy level.  The distribution is
  designed to form a `ladder', so that each output achieves the highest
  `rung' (maximum probability) compared to less preferable outputs.  We
  show how our ladder framework can be applied to problems of counting the
  number of occurrences of subgraphs, a vital objective in graph analysis,
  and give algorithms whose cost is comparable to that of computing the
  count exactly.  Our experimental study confirms that our method
  outperforms existing methods for counting triangles and stars in
  terms of accuracy, and provides solutions for some problems for which
  no effective method was previously known.
  The results of our algorithms can be used to estimate the parameters of
  suitable graph models, allowing synthetic graphs to be sampled.}
  author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor and Justin Thaler and Suresh Venkatasubramanian},
  year = 2015,
  booktitle = {Computational Complexity Conference},
  title = {Verifiable Stream Computation and {Arthur}-{Merlin} Communication},
  url = {../papers/oipccc.pdf},
  abstract = {
In the setting of streaming interactive proofs (SIPs), a client (verifier)
needs to compute a given function on a massive stream of data, arriving
online, but is unable to store even a small fraction of the data. It
outsources the processing to a third party service (prover), but is unwilling
to blindly trust answers returned by this service. Thus, the service cannot
simply supply the desired answer; it must {\em convince} the verifier of its
correctness via a short interaction after the stream has been seen.

In this work we study ``barely interactive'' SIPs. Specifically, we show that
two or three rounds of interaction suffice to solve several query
problems---including Index, Median, Nearest Neighbor Search, Pattern Matching,
and Range Counting---with polylogarithmic space and communication costs.  Such
efficiency with $O(1)$ rounds of interaction was thought to be impossible
based on previous work.

On the other hand, we initiate a formal study of the limitations of
constant-round SIPs by introducing a new hierarchy of communication models
called Online Interactive Proofs (OIPs). The online nature of these models is
analogous to the streaming restriction placed upon the verifier in an SIP. We
give upper and lower bounds that (1) characterize, up to quadratic blowups,
every finite level of the OIP hierarchy in terms of other well-known
communication complexity classes, (2) separate the first four levels of the
hierarchy, and (3) reveal that the hierarchy collapses to the fourth level.
Our study of OIPs reveals marked contrasts and some parallels with the classic
Turing Machine theory of interactive proofs, establishes limits on the power
of existing techniques for developing constant-round SIPs, and provides a new
characterization of (non-online) Arthur--Merlin communication in terms of an
online model.}
  author = {Zach Jorgensen and Ting Yu and Graham Cormode},
  title = {Conservative or Liberal? Personalized Differential Privacy},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2015,
  url = {../papers/pdp.pdf},
  abstract = {
  Differential privacy is widely accepted as a powerful framework for 
  providing strong, formal privacy guarantees for aggregate data analysis. 
  A limitation of the model is that the same level of privacy protection 
  is afforded for all individuals. However, it is common that the data 
  subjects have quite different expectations regarding the acceptable 
  level of privacy for their data. Consequently, differential privacy may 
  lead to insufficient privacy protection for some users, while 
  over-protecting others. 
  We argue that by accepting that not all users require the same level of 
  privacy, a higher level of utility can often be attained by not 
  providing excess privacy to those who do not want it. We 
  propose a new privacy definition called \emph{personalized differential 
  privacy} (PDP), a generalization of differential privacy in which users 
  specify a personal privacy requirement for their data. We then introduce 
  several novel mechanisms for achieving PDP. Our primary mechanism is a 
  general one that automatically converts any existing 
  differentially private algorithm into one that satisfies PDP. We also 
  present a more direct approach for achieving PDP, 
  inspired by the well-known exponential mechanism. We demonstrate 
  our framework through extensive experiments on real and synthetic data. }
  title = {Parameterized Streaming: Maximal Matching and Vertex Cover},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  year = 2015,
  author = {Rajesh Chitnis and Graham Cormode and MohammadTaghi Hajiaghayi and Morteza Monemizadeh},
  url = {../papers/paramstreaming.pdf},
  abstract = {
    As graphs continue to grow in size, we seek ways to effectively process such data at scale.
    The model of streaming graph processing, in which a compact summary is maintained as each
    edge insertion/deletion is observed, is an attractive one.
    However, few results are known for optimization problems over such dynamic graph streams.
    In this paper, we introduce a new approach to handling graph streams,
    by instead seeking solutions for the parameterized versions of these problems.
    Here, we are given a parameter $k$ and the objective is to decide
    whether there is a solution bounded by $k$.
    By combining kernelization techniques with randomized sketch structures,
    we obtain the first streaming algorithms for the parameterized
    versions of Maximal Matching and Vertex Cover.
    We consider various models for a graph stream on $n$ nodes:
    the insertion-only model where the edges can only be added,
    and the dynamic model where edges can be both inserted and deleted.
    More formally, we show the following results:
    In the insertion only model, there is a one-pass deterministic algorithm
    for the parameterized Vertex Cover problem which computes a sketch
    using $O\sim(k^{2})$ space [
$O\sim(f(k))=O(f(k)\cdot\log^{O(1)} m)$, where $m$ is the number of edges.]
      such that at each
      timestamp in time $O\sim(2^{k})$ it can either extract a solution of
      size at most $k$ for the current instance, or report that no such solution exists.
    We also show a tight lower bound of $\Omega(k^2)$ for the space
    complexity of any (randomized) streaming algorithms for
    the parameterized Vertex Cover, even in the insertion-only model.
    $*$ In the dynamic model, and under the \emph{promise} that at each
    timestamp there is a maximal matching of size at most $k$, there is a one-pass
    $O\sim(k^2)$-space (sketch-based) dynamic algorithm that maintains a maximal matching
    with worst-case update time [The time to update the current maximal matching upon an insertion or deletion.]
    $O\sim(k^2)$. This algorithm partially solves Open Problem $64$ from the List of open problems in sublinear algorithms.
    An application of this dynamic matching algorithm is a one-pass
    $O\sim(k^{2})$-space streaming algorithm for the parameterized Vertex Cover problem that in time $O\sim(2^{k})$
    extracts a solution for the final instance with probability $1-\delta/{n^{O(1)}}$, where $\delta<1$.
    To the best of our knowledge, this is the first graph streaming algorithm that combines linear
    sketching with sequential operations that depend on the graph at the current time.
    $*$ In the dynamic model without any promise, there is a one-pass
    randomized algorithm for the parameterized Vertex Cover problem
    which computes a sketch using $O\sim(nk)$ space such that in time $O\sim(nk+2^{k})$ it can either extract a solution of size
    at most $k$ for the final instance, or report that no such solution exists.}
  title = {PrivBayes: Private Data Release via Bayesian Networks},
  author = {Graham Cormode and Magda Procopiuc and Divesh Srivastava and Xiaokui Xiao and Jun Zhang},
  year = 2014,
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  url = {../papers/PrivBayes.pdf},
  slides = {../slides/PrivBayes.pdf},
  abstract = {
    Privacy-preserving data publishing is an important problem that has
    been the focus of extensive study.
    The state-of-the-art goal for this problem is differential privacy,
    which offers a strong degree of privacy protection without making
    restrictive assumptions about the adversary.
    Existing techniques using differential privacy, however, cannot
    effectively handle the publication of high-dimensional data.
    In particular, when the input dataset contains a large number of
    attributes, existing methods require injecting a prohibitive amount of
    noise compared to the signal in the data, which renders the
    published data next to useless.
    To address the deficiency of the existing methods, this paper presents
    PrivBayes, a differentially private method for releasing
    high-dimensional data.
    Given a dataset $D$, PrivBayes first constructs a Bayesian network
    $\mathcal{N}$, which (i) provides a succinct model of the correlations
    among the attributes in $D$ and (ii) allows us to approximate the
    distribution of data in $D$ using a set $\mathcal{P}$ of
    low-dimensional marginals of $D$.
    After that, PrivBayes injects noise
    into each marginal in $\mathcal{P}$ to ensure differential privacy,
    and then uses the noisy marginals and the Bayesian network to
    construct an approximation of the data distribution in $D$. Finally,
    PrivBayes samples tuples from the approximate distribution to
    construct a synthetic dataset, and then releases the synthetic
    data. Intuitively, PrivBayes circumvents the curse of dimensionality,
    as it injects noise into the low-dimensional marginals in
    $\mathcal{P}$ instead of the high-dimensional dataset $D$.
    Private construction of Bayesian networks turns out to be
    significantly challenging, and we introduce a novel approach that uses
    a surrogate function for mutual information to build the model more
    accurately. We experimentally evaluate PrivBayes on real data, and demonstrate that it significantly outperforms existing solutions in terms of accuracy.}
  title = {Modeling Collaboration in Academia: A Game Theoretic Approach},
  author = {Qiang Ma and S. Muthukrishnan and Brian Thompson and Graham Cormode},
  booktitle = {WWW Workshop on Big Scholarly Data},
  year = 2014,
  url = {../papers/hgame.pdf},
  abstract = {
    In this work, we aim to understand the mechanisms driving academic collaboration. We begin by building a model for how researchers split their effort between multiple papers, and how collaboration affects the number of citations a paper receives, supported by observations from a large real-world publication and citation dataset, which we call the {\em h-Reinvestment model}. Using tools from the field of Game Theory, we study
    researchers' collaborative behavior over time under this model, with the premise that each researcher wants to maximize his or her academic success. We find analytically that there is a strong incentive to collaborate rather than work in isolation, and that studying collaborative behavior through a game-theoretic lens is a promising approach to help us better understand the nature and dynamics of academic collaboration.}
  author = {Graham Cormode and S. Muthukrishnan and Jinyun Yan},
  title = {People Like Us: Mining Scholarly Data for Comparable Researchers},
  booktitle = {WWW Workshop on Big Scholarly Data},
  year = 2014,
  url = {../papers/comparableppl.pdf},
  abstract = {
    We present the problem of finding comparable researchers for any given researcher. 
    This problem has many motivations.
    Firstly, know thyself.
    The answers of where we stand among research community and who we are
    most alike may not be easily found by existing evaluations of one's research 
    mainly based on citation counts.
    Secondly, there are many situations where one needs to find comparable researchers e.g., for reviewing peers, constructing programming committees or
    compiling teams for grants. %, etc. 
    It is often done through an ad hoc and informal basis. 
    Utilizing the large scale scholarly data accessible on the web, we address the problem of 
    automatically finding comparable researchers.
    We propose a standard to quantify the quality of research output, via the quality of publishing venues. 
    We represent a researcher as a sequence of her publication records, and develop a framework of comparison of researchers by sequence matching. 
    Several variations of comparisons are considered including matching by
    quality of publication venue and research topics, and performing prefix matching.  
    We evaluate our methods on a large corpus and
    demonstrate the effectiveness  of our methods through examples.
    In the end, we identify several promising directions for further work. }
  author = {Amit Chakrabarti and Graham Cormode and Navin Goyal and Justin Thaler},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  year = 2014,
  title = {Annotations for Sparse Data Streams},
  url = {../papers/sparseannotations.pdf},
  link = {},
  abstract = {
  Motivated by the surging popularity of commercial cloud computing services, a
  number of recent works have studied \emph{annotated data streams} and variants.
  In this setting, a computationally weak \emph{verifier} (cloud user), lacking
  the resources to store and manipulate his massive input locally, accesses a
  powerful but untrusted \emph{prover} (cloud service).  The verifier must work
  within the restrictive data streaming paradigm.  The prover, who can
  \emph{annotate} the data stream as it is read, must not just supply the final
  answer but also convince the verifier of its correctness. Ideally, both the amount
  of annotation from the prover and the space used by the verifier should be
  sublinear in the relevant input size parameters.
  A rich theory of such algorithms---which we call \emph{schemes}---has started
  to emerge. Prior work has shown how to leverage the prover's power to
  efficiently solve problems that have no non-trivial standard data stream
  algorithms.  However, even though optimal schemes are now known for several
  basic problems, such optimality holds only for streams whose length is commensurate with the
  size of the \emph{data universe}.  In contrast, many real-world data sets are
  relatively \emph{sparse}, including graphs that contain only $o(n^2)$ edges, and IP
  traffic streams that contain much fewer than the total number of possible IP
  addresses, $2^{128}$ in IPv6.
  Here we design the first annotation schemes that allow both the annotation and
  the space usage to be sublinear in the total number of stream {\em updates}
  rather than the size of the data universe.  We solve significant problems,
  including variations of INDEX, \textsc{set-disjointness}, and
  \textsc{frequency-moments}, plus several natural problems on graphs. On the
  other hand, we give a new lower bound that, for the first time, rules out
  smooth tradeoffs between annotation and space usage for a specific problem.
  Our technique brings out new nuances in Merlin--Arthur communication
  complexity models, and provides a separation between online versions of the MA
  and AMA models. }
  author = {Graham Cormode and S. Muthukrishnan and Jinyun Yan},
  title = {First Author Advantage: Citation Labeling in Research},
  booktitle = {Proceedings of the Computational Scientometrics: Theory and Applications Workshop at CIKM},
  year = 2013,
  url = {../papers/firstauthor.pdf},
  link = {},
  abstract = {
  Citations among research papers, and the networks they form, are the primary
  object of study in scientometrics. The act of making a citation reflects
  the citer's knowledge of the related literature, and of the work being cited.
  We aim to gain insight into this process by studying citation keys: user-generated
  labels to identify a cited work. Our main observation is that the
  first listed author is disproportionately represented in such labels, implying
a strong mental bias towards the first author.}
  author = {Graham Cormode and Xi Gong and Cecilia M. Procopiuc and Entong Shen and Divesh Srivastava and Ting Yu},
  booktitle = {{ACM} Conference on Information and Knowledge Management ({CIKM})},
  year = 2013,
  title = {{UMicS}: From Anonymized Data to Usable MicroData},
  url = {../papers/umics.pdf},
  abstract = {
  There is currently a tug-of-war going on surrounding data
  releases. On one side, there are many strong reasons pulling to
  release data to other parties: business factors, freedom of
  information rules, and scientific sharing agreements. On the
  other side, concerns about individual privacy pull back, and
  seek to limit releases. Privacy technologies such as
  differential privacy have been proposed to resolve this
  deadlock, and there has been much study of how to perform
  private data release of data in various forms. The focus of
  such works has been largely on the {\em data owner}: what
  process should they apply to ensure that the released data
  preserves privacy whilst still capturing the input data
  distribution accurately. Almost no attention has been paid to
  the needs of the {\em data user}, who wants to make use of the
  released data within their existing suite of tools and data.
  The difficulty of making use of data releases is a major
  stumbling block for the widespread adoption of data privacy techniques.
  In this paper, instead of proposing new privacy mechanisms for
  data publishing, we consider the whole data release process,
  from the data owner to the data user. We lay out a set of
  principles for privacy tool design that highlights the
  requirements for {\em interoperability}, {\em extensibility}
  and {\em scalability}. We put these into practice with {\sf UMicS},
  an end-to-end prototype system to control the release and use
  of private data. An overarching tenet is that it should be
  possible to integrate the released data into the data user's
  systems with the minimum of change and cost. We describe how to
  instantiate {\sf UMicS} in a variety of usage scenarios. We show how
  using data modeling techniques from machine learning can
  improve the utility, in particular when combined with
  background knowledge that the data user may possess. We
  implement {\sf UMicS}, and evaluate it over a selection of data sets
  and release cases. We see that {\sf UMicS} allows for very effective
use of released data, while upholding our privacy principles.}
  title = {Lightweight Authentication of Linear Algebraic Queries on Data Streams},
  author = {Stavros Papadopoulos and Graham Cormode and Antonis Deligiannakis and Minos Garofalakis},
  year = 2013,
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  url = {../papers/streamauthsigmod.pdf},
  poster = {../slides/streamauthposter.pdf},
  abstract = {
  We consider a \emph{stream outsourcing} setting, where a data owner
  delegates the management of a set of disjoint data streams to an
  untrusted server. The owner \emph{authenticates} his streams via
  signatures. The server processes continuous queries on the union of
  the streams for clients trusted by the owner. Along with the results,
  the server sends proofs of result correctness derived from the owner's
  signatures, which are easily verifiable by the clients. We design
  novel constructions for a collection of fundamental problems over streams
  represented as \emph{linear algebraic} queries. In particular, our basic schemes
  authenticate \emph{dynamic vector sums} and
  \emph{dot products}, 
  as well as \emph{dynamic matrix products}. 
  These techniques can be adapted for authenticating a wide range
  of important operations in streaming environments, including group by
  queries, joins, in-network aggregation, similarity matching, and event processing.
  All our schemes are very \emph{lightweight}, and offer strong \emph{cryptographic}
  guarantees derived from formal definitions and proofs. 
  We experimentally confirm the practicality of our schemes in the
  performance sensitive streaming setting.   }
  title = {Quantiles over Data Streams: An Experimental Study},
  author = {Lu Wang and Ge Luo and Ke Yi and Graham Cormode},
  year = 2013,
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  url = {../papers/nquantiles.pdf},
  abstract = {
    A fundamental problem in data management and analysis is to generate
    descriptions of the distribution of data.  It is most common to give such
    descriptions in terms of the cumulative distribution, which is
    characterized by the quantiles of the data.  The design and engineering
    of efficient methods to find these quantiles has attracted much study,
    especially in the case where the data is described incrementally, and we
    must compute the quantiles in an online, streaming fashion.  Yet while
    such algorithms have proved to be tremendously useful in practice, there
    has been limited formal comparison of the competing methods, and no
    comprehensive study of their performance.  In this paper, we remedy this
    deficit by providing a taxonomy of different methods, and describe
    efficient implementations.  In doing so, we propose and analyze
    variations that have not been explicitly studied before, yet which turn
    out to perform the best.  To illustrate this, we provide detailed
    experimental comparisons demonstrating the tradeoffs between space, time,
  and accuracy for quantile computation.}
  author = {Graham Cormode and Cecilia M. Procopiuc and Entong Shen and Divesh Srivastava and Ting Yu},
  booktitle = {Privacy-Preserving Data Publication and Analysis (PrivDB)},
  title = {Empirical Privacy and Empirical Utility of Anonymized Data},
  att_authors = {gc2602,ds8961,cp2838},
  att_private = {false},
  year = {2013},
  url = {../papers/empiricalpriv.pdf},
  abstract = {Procedures to anonymize data sets are vital for companies, government
agencies and other bodies to meet their obligations to share data
without compromising the privacy of the individuals contributing to it.
Despite much work on this topic, the area has not yet reached stability.
Early models ($k$-anonymity and $l$-diversity) are now thought to
offer insufficient privacy.
Noise-based methods like differential privacy are seen as providing
stronger privacy, but less utility.
However, across all methods
sensitive information of some individuals can often be
inferred with relatively high accuracy.

In this paper, we reverse the idea of a `privacy attack,' by incorporating it
into a measure of privacy.
Hence, we advocate the notion of {\em empirical privacy}, based on the posterior beliefs of an adversary,
and their ability to draw inferences about sensitive values in the data.
This is not a new model, but rather a unifying view:
it allows us to study several well-known privacy models
which are not directly comparable otherwise.
We also consider an empirical approach to measuring utility, based on
a workload of queries.
Consequently, we are able to place different privacy models including
differential privacy and early syntactic models on
the same scale, and compare their privacy/utility tradeoff.
We learn that, in practice, the difference between differential
privacy and various syntactic models
is less dramatic than previously thought, but there are still clear
domination relations between them.}
  author = {Graham Cormode and Katsiaryna Mirylenka and Themis Palpanas and Divesh Srivastava},
  title = {Finding Interesting Correlations with Conditional Heavy Hitters},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2013,
  att_authors = {gc2602,ds8961},
  att_private = {false},
  url = {../papers/condhh.pdf},
  poster = {../slides/condhhposter.pdf},
  slides = {../slides/condhhicde.pdf},
  abstract = {The notion of {\em heavy hitters}---items that make up a large
fraction of the population---has been successfully used in a variety
of applications across sensor and RFID monitoring, network data
analysis, event mining, and more.  Yet this notion often fails to
capture the semantics we desire when we observe data in the form of
correlated pairs. Here, we are interested in items that are {\em
  conditionally} frequent: when a particular item is frequent within
the context of its parent item. In this work, we introduce and
formalize the notion of Conditional Heavy Hitters to identify such
items, with applications in network monitoring, and Markov chain modeling.
We introduce several streaming algorithms that allow us to find
conditional heavy hitters efficiently, and provide analytical results.
Different algorithms are successful for different input
characteristics.  We perform an experimental evaluation to demonstrate
the efficacy of our methods, and to study which algorithms are most
suited for different types of data. }
  author = {Graham Cormode and Cecilia M. Procopiuc and Divesh Srivastava and Grigory Yaroslavtsev},
  title = {Accurate and Efficient Private Release of Datacubes and Contingency Tables},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2013,
  att_authors = {gc2602,ds8961,cp2838},
  att_private = {false},
  url = {../papers/privcube.pdf},
  abstract = {A central problem in releasing aggregate information about
sensitive data is to do so accurately
while providing a privacy guarantee on the output.
Recent work focuses on the class of {\em linear queries}, which include basic
counting queries, data cubes, and contingency tables. The goal is to
maximize the utility of their output, while giving a rigorous privacy guarantee.
Most results follow a common template: pick a ``strategy'' set of linear
queries to apply to the data, then use the noisy answers to these
queries to reconstruct the queries of interest.
This entails either picking a strategy set that is hoped to be good
for the queries, or performing a costly search over the space
of all possible strategies.

In this paper, we propose a new approach that balances accuracy and efficiency:
  we show how to improve the accuracy of a given query set by
  answering some strategy queries more accurately than others.
This leads to an efficient optimal noise allocation for many popular strategies,
including wavelets, hierarchies, Fourier coefficients and more.
For the important case of marginal queries we show that this strictly improves on previous
methods, both analytically and empirically.
Our results also extend to ensuring that the returned query answers
are consistent with an (unknown) data set at minimal extra cost in
terms of time and noise.}
  author = {Graham Cormode and Donatella Firmani},
  title = {On Unifying the Space of $l_0$-Sampling Algorithms},
  booktitle = {{SIAM} Meeting on Algorithm Engineering and Experiments},
  year = 2013,
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/l0samp.pdf},
  abstract = {
    The problem of building an $l_0$-sampler is to sample
    near-uniformly from the support set of a dynamic multiset.  
    This problem has a variety of applications within data analysis,
    computational geometry and graph algorithms. 
    In this paper, we abstract a set of steps for building an $l_0$-sampler, based on sampling, recovery and selection. 
    We analyze the implementation of an $l_0$-sampler within this
    framework, and show how prior constructions of 
    $l_0$-samplers can all be expressed in terms of these steps. 
    Our experimental contribution is to provide a first detailed study of the
    accuracy and computational cost of $l_0$-samplers.}
  author = {Amit Goyal and
               Hal {Daum{\'e} III} and
               Graham Cormode},
  title = {Sketch Algorithms for Estimating Point Queries in {NLP}},
  booktitle = {EMNLP-CoNLL},
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/nlpsketch.pdf},
  year = {2012},
  pages = {1093-1103},
  abstract = {
Many NLP tasks rely on accurate statistics
from large corpora. Tracking complete
statistics is memory intensive, so recent
work has proposed using compact approximate
``sketches'' of frequency distributions.
We describe 10 sketch methods, including existing
and novel variants. We compare and
study the errors (over-estimation and underestimation)
made by the sketches. We evaluate
several sketches on three important NLP problems.
Our experiments show that {\em one} sketch
performs best for all the three tasks.}
  title = {Mergeable Summaries},
  author = {Pankaj Agarwal and Graham Cormode and Zengfeng Huang and Jeff Phillips and Zhewei Wei and Ke Yi},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  att_authors = {gc2602},
  att_private = {false},
  year = 2012,
  url = {../papers/mergeable.pdf},
  abstract = {
We study the {\em mergeability} of data summaries.  Informally speaking,
mergeability requires that, given two summaries on two data sets, there is
a way to merge the two summaries into a single summary on the union of the two data sets, while preserving the error and size guarantees.  This property means that the summaries can be merged in a way like other algebraic
operators such as sum and max, 
which is especially useful for computing summaries on massive distributed
data.  Several data summaries are trivially mergeable by construction, most
notably all the {\em sketches} that are linear functions of the data sets.
But some other fundamental
ones like those for heavy hitters and quantiles, are not (known to be)
mergeable.  In this paper, we demonstrate that these summaries are indeed
mergeable or can be made mergeable after appropriate modifications.
Specifically, we show that for $\epsilon$-approximate heavy hitters, there is a
deterministic mergeable summary of size $O(1/\epsilon)$; for $\epsilon$-approximate
quantiles, there is a deterministic summary of size $O(\frac{1}{\epsilon}
\log(\epsilon n))$ that has a restricted form of mergeability, and a randomized
one of size $O(\frac{1}{\epsilon} \log^{3/2}\frac{1}{\epsilon})$ with full
mergeability.  We also extend our results to geometric summaries such as
$\epsilon$-approximations and $\epsilon$-kernels.

We also achieve two results of independent interest: 
(1) we provide the best known randomized streaming bound for $\epsilon$-approximate quantiles that depends only on $\epsilon$, of size $O(\frac{1}{\epsilon} \log^{3/2}\frac{1}{\epsilon})$, and 
(2) we demonstrate that the MG and the SpaceSaving summaries for heavy hitters are isomorphic.}
  author = {Edith Cohen and Graham Cormode and Nick Duffield},
  year = 2012,
  att_authors = {gc2602,nd1321},
  att_private = {false},
  booktitle = {{ACM} Conference on Measurement and Modeling of Computer Systems ({SIGMETRICS})},
  url = {../papers/sampdel.pdf},
  title = {Don't Let The Negatives Bring You Down: Sampling from Streams of Signed Updates},
  abstract = {
Random sampling has been proven time and time again to be a powerful tool 
for working with large data. Queries over the full dataset are replaced by 
approximate queries over the smaller (and hence easier to store and 
manipulate) sample. The sample constitutes a flexible summary that 
supports a wide class of queries. But in many applications, datasets are 
modified with time, and it is desirable to update samples without 
requiring access to the full underlying datasets. In this paper, we 
introduce and analyze novel techniques for sampling over dynamic data, 
modeled as a stream of modifications to weights associated with each key.

While sampling schemes designed for stream applications can often readily 
accommodate positive updates to the dataset, much less is known for the 
case of negative updates, where weights are reduced or items deleted 
altogether. We primarily consider the turnstile model of streams, and 
extend classic schemes to incorporate negative updates. Perhaps 
surprisingly, the modifications to handle negative updates turn out to be 
natural and seamless extensions of the well-known positive 
update-only algorithms. We show that they produce unbiased estimators, and 
we relate their performance to the behavior of corresponding algorithms on 
insert-only streams with different parameters. A careful analysis is 
necessitated, in order to account for the fact that sampling choices for 
one key now depend on the choices made for other keys.

In practice, our solutions turn out to be efficient and accurate.
Compared to recent algorithms for $L_p$ sampling which can be applied to this 
problem, they are significantly more reliable, and dramatically faster.   }
  author = {Graham Cormode and Ke Yi},
  year = 2012,
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {Scientific and Statistical Database Management ({SSDBM})},
  slides = {../slides/cdsliwinssdbm.pdf},
  url = {../papers/cdsliwinssdbm.pdf},
  title = {Tracking Distributed Aggregates over Time-based Sliding Windows},
  abstract = {
The area of distributed monitoring requires tracking the value of a
function of distributed data as new observations are made.
An important case is when attention is restricted to only a recent
time period, such as the last hour of readings---the sliding window model.
In this paper, we introduce a novel paradigm for handling such
monitoring problems, which we dub the ``forward/backward'' approach. 
This view allows us to provide 
optimal or near-optimal solutions for several fundamental problems, such
as counting, tracking frequent items, and maintaining order statistics.
The resulting protocols  improve on
previous work or give the first solutions for some problems, 
and operate efficiently in terms of space and time needed.  
Specifically, we obtain optimal $O(\frac{k}{\epsilon} \log (\epsilon n/k))$
communication per window of $n$ updates for tracking counts and heavy
hitters with accuracy $\epsilon$ across $k$ sites; and near-optimal
communication of $O(\frac{k}{\epsilon} \log^2(1/\epsilon) \log (n/k))$
for quantiles.  
We also present solutions for problems such as tracking distinct
items, entropy, and convex hull and diameter of point sets.}
  author = {Graham Cormode and S. Muthukrishnan and Jinyun Yan},
  year = 2012,
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/fun12.pdf},
  link = {},
  booktitle = {Proceedings of the International Conference on Fun with Algorithms ({FUN})},
  title = {Scienceography: the study of how science is written},
  abstract = {
Scientific literature has itself been the subject of much scientific
study, for a variety of reasons: understanding how results are
communicated, how ideas spread, and assessing the influence of areas
or individuals.  
However, most prior work has focused on extracting and analyzing
citation and stylistic patterns. 
In this work, we introduce the notion of `scienceography', which
focuses on the {\em writing} of science. 
We provide a first large scale study using data derived from
the arXiv e-print repository. 
Crucially, our data includes the ``source code'' of scientific papers---the \LaTeX source---which enables us to study features not present in the ``final product'',
such as the tools used and private comments between authors. 
Our study identifies broad patterns and trends in two example areas---computer science and mathematics---as well as highlighting key
differences in the way that science is written in these fields. 
Finally, we outline future directions to extend the new
topic of scienceography.}
  author = {Graham Cormode and Magda Procopiuc and Divesh Srivastava and Thanh Tran},
  att_authors = {gc2602,ds8961,cp2838},
  att_private = {false},
  booktitle = {International Conference on Database Theory (ICDT)},
  url = {../papers/sparsedp.pdf},
  year = 2012,
  link = {},
  title = {Differentially Private Publication of Sparse Data},
  abstract = {The problem of privately releasing data is to provide a version of a 
dataset without revealing sensitive information about the individuals 
who contribute to the data.  The model of differential privacy allows 
such private release while providing strong guarantees on the output. 
A basic mechanism achieves differential privacy by adding noise to the 
frequency counts in the contingency tables (or, a subset of the count 
data cube) derived from the dataset.  However, when the dataset is 
sparse in its underlying space, as is the case for most multi-attribute 
relations, then the effect of adding noise is to vastly increase the 
size of the published data: it implicitly creates a huge number of 
dummy data points to mask the true data, making it almost impossible
to work with. 

We present techniques to overcome this roadblock and allow efficient 
private release of sparse data, while maintaining the guarantees of 
differential privacy.  Our approach is to release a compact summary of 
the noisy data.  Generating the noisy data and then summarizing it 
would still be very costly, so we show how to shortcut this step, and 
instead directly generate the summary from the input data, without 
materializing the vast intermediate noisy data.  We instantiate this 
outline for a variety of sampling and filtering methods, and show how 
to use the resulting summary for approximate, private, query answering. 
Our experimental study shows that this is an effective, practical
solution: in some examples we generate output that is 1000 times
smaller than the naive method, in less than 1\% of the time while
achieving comparable and occasionally improved utility over the 
costly materialization approach.}
  author = {Graham Cormode and Magda Procopiuc and Entong Shen and Divesh Srivastava and Ting Yu},
  title = {Differentially Private Spatial Decompositions},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  att_authors = {gc2602,ds8961,cp2838},
  att_private = {false},
  year = 2012,
  link = {},
  url = {../papers/spatialpriv.pdf},
  abstract = {Differential privacy has recently emerged as the de facto standard for private
data release.
This makes it possible to provide
strong theoretical guarantees on the privacy and utility of released data.
While it is well-understood how to release data based on counts and simple
functions under this guarantee, it remains to provide general purpose
techniques that are useful for a wider variety of queries.
In this paper, we focus on spatial data, i.e., any multi-dimensional
data that can be indexed by a tree structure.
Directly applying existing differential privacy methods to this type
of data simply generates noise.

We propose instead the class of ``private spatial decompositions'':
these adapt standard spatial indexing methods such
as quadtrees and kd-trees to provide a private description of the
data distribution.
Equipping such structures with differential privacy requires several
steps to ensure that they provide meaningful privacy guarantees.
Various basic steps, such as choosing splitting points and describing
the distribution of points within a region, must be done privately,
and the guarantees of the different building blocks must be composed into
an overall guarantee.
Consequently, we expose the design space for private spatial
decompositions, and analyze some key examples.
A major contribution of our work is to
provide new techniques for parameter setting and
post-processing of the output to improve
the accuracy of query answers.
Our experimental study demonstrates that it is possible to build such
decompositions efficiently, and use them to answer a variety of
queries privately and with high accuracy.}
  author = {Meiyu Lu and Srinivas Bangalore and Graham Cormode and 
Marios Hadjieleftheriou and Divesh Srivastava},
  title = {A Dataset Search Engine for the Research Document Corpus},
  att_authors = {gc2602,ds8961,sb7658,mh6516},
  att_private = {false},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  url = {../papers/dataset.pdf},
  year = 2012,
  abstract = {
A key step in validating a proposed idea or system
is to evaluate over a suitable dataset. However, to this date there
have been no useful tools for researchers to understand which
datasets have been used for what purpose, or in what prior
work. Instead, they have to manually browse through papers
to find the suitable datasets and their corresponding URLs,
which is laborious and inefficient. To better aid the dataset
discovery process, and provide a better understanding of how
and where datasets have been used, we propose a framework to
effectively identify datasets within the scientific corpus. The key
technical challenges are identification of datasets, and discovery
of the association between a dataset and the URLs where they
can be accessed. Based on this, we have built a user friendly
web-based search interface for users to conveniently explore the
dataset-paper relationships, and find relevant datasets and their corresponding URLs.}
  author = {Graham Cormode and Entong Shen and Divesh Srivastava and Ting Yu},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/licm.pdf},
  year = 2012,
  title = {Aggregate Query Answering on Possibilistic Data with Cardinality Constraints},
  abstract = {Uncertainties in data arise for a number of reasons: when the data
set is
incomplete, contains conflicting information or has been deliberately
perturbed or coarsened to remove sensitive details. An important case
which arises in many real applications
is when the data describes a set of {\em possibilities}, but with
{\em cardinality constraints}. These
constraints represent correlations between tuples encoding, e.g. that
at most two possible records are correct, or that there is an
(unknown) one-to-one mapping between a set of tuples and attribute
values. Although there has been much effort to handle
uncertain data, current systems are not equipped to handle such
correlations, beyond simple mutual exclusion and co-existence
constraints. Vitally, they have little support for efficiently handling
{\em aggregate} queries on such data.

In this paper, we aim to address some of these deficiencies, by
introducing LICM (Linear Integer Constraint Model), which can
succinctly represent many types of tuple correlations, particularly a
class of {\em cardinality constraints}. We motivate and explain the
model with examples from data cleaning and masking sensitive data,
to show that it enables
modeling and querying such data, which was not previously possible. We
develop an efficient strategy to answer conjunctive and aggregate queries
on possibilistic data by describing how to implement relational
operators over data in the model. LICM compactly integrates the
encoding of correlations, query answering and lineage recording. In
combination with off-the-shelf linear integer programming solvers, our
approach provides exact bounds for aggregate queries. Our prototype
implementation demonstrates that query answering with LICM can be
effective and scalable.}
  author = {Graham Cormode and Michael Mitzenmacher and Justin Thaler},
  title = {Practical Verified Computation with Streaming Interactive Proofs},
  booktitle = {Innovations in Theoretical Computer Science (ITCS)},
  year = 2012,
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/sipitcs.pdf},
  link = {},
  abstract = {
When delegating computation to a service provider, as in the cloud computing
paradigm, we
seek some reassurance that the output is correct and complete. 
Yet recomputing the output as a check is inefficient and expensive, and
it may not even be feasible to store all the data locally. 
We are therefore interested in
what can be validated by a streaming (sublinear space) user, who
cannot store the full input, or perform the full computation herself.
Our aim in this work is to advance a recent line of work on ``proof
systems'' in which the service provider {\em proves} the correctness of
its output to a user.
The goal is to minimize the time and space costs of both parties in
generating and checking the proof. 
Only very recently have there been attempts to implement such proof
systems, and thus far these have been quite limited in functionality.  

Here, our approach is two-fold. 
First, we describe a carefully crafted instantiation of one of the
most efficient 
general-purpose constructions for arbitrary computations 
(streaming or otherwise), due to Goldwasser,
Kalai, and Rothblum.
This requires several new insights and enhancements to 
move the methodology from a theoretical result to a practical
possibility. Our main contribution is in achieving a prover that 
runs in time $O(S(n) \log S(n))$, where $S(n)$ is the size of an arithmetic circuit computing the function of interest;
this compares favorably to the poly$(S(n))$ runtime for the prover promised in prior work. Our experimental results
demonstrate that a practical general-purpose protocol for verifiable computation may be significantly closer to reality than previously realized.

Second, we describe a set of techniques that achieve
genuine scalability for protocols
fine-tuned for specific important problems in streaming and database
processing. Focusing in particular on \emph{non-interactive} protocols for problems ranging from matrix-vector multiplication
to bipartite perfect matching,
we build on prior work \cite{annotations, graphstream} to achieve a
prover that runs in nearly linear-time, 
while obtaining optimal tradeoffs between communication cost and the user's working memory. 
Existing techniques required (substantially)
superlinear time for the prover. Finally, we develop improved \emph{interactive} protocols for specific problems based on a linearization 
technique originally due to Shen. 
We argue that
even if general-purpose methods improve,
special purpose protocols will remain valuable in real-world settings for
key problems, and hence special attention to specific problems is warranted.}
  title = {Verifying Computations with Streaming Interactive Proofs},
  author = {Graham Cormode and Justin Thaler and Ke Yi},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  month = sep,
  year = 2012,
  url = {../papers/ipstreamvldb.pdf},
  link = {},
  abstract = {
  When computation is outsourced, the data owner would like to be
  reassured that the desired computation has been performed correctly by
  the service provider.
  In theory, methods based on proof systems can give the data
    owner the necessary assurance, but previous work does not give a
    sufficiently scalable and practical solution, requiring a lot of time,
    space or computational power for both parties.  
  In this paper, we develop
    new proof protocols for verifying computations which are streaming in
    nature: the verifier (data owner) needs only logarithmic space and a single pass over the input, and
    after observing the input follows a simple
    protocol with a prover (service provider) that takes logarithmic
    communication spread over a logarithmic number
    of rounds.  
  These ensure that the computation is performed correctly: that the
  service provider has not made any errors or missed out some data. 
  The guarantee is very strong: even if the service provider
  deliberately tries to cheat, there is only vanishingly small
  probability of doing so undetected, while a correct computation is
  always accepted. 
  We first observe that some existing theoretical results 
  can be modified to work with streaming verifiers, 
  which shows that there exist efficient protocols for
  problems in the complexity classes $\mathsf{NP}$ and $\mathsf{NC}$. 
  Our main results 
  seek to bridge the gap between theory and practice by developing
    usable protocols
    for a variety of problems of central importance in 
    streaming and database processing. 
  All of our protocols achieve statistical 
    soundness and most require only logarithmic communication between
    prover and verifier.
  We also experimentally demonstrate their practicality and scalability. All
    these problems require linear space in the traditional streaming model,
    showing that adding a prover can exponentially reduce the effort needed
    by the verifier.}
  title = {Mergeable Coresets},
  author = {Pankaj Agarwal and Graham Cormode and Zengfeng Huang and Jeff Phillips and Zhewei Wei and Ke Yi},
  booktitle = {Third Workshop on Massive Data Algorithmics (MASSIVE)},
  att_authors = {gc2602},
  att_private = {false},
  year = 2011,
  url = {../papers/massive.pdf},
  abstract = {
  We study the {\em mergeability} of data summaries.  Informally speaking,
  mergeability requires that, given two summaries on two data sets, there is
  a way to merge the two summaries into a summary on the two data sets
  combined together, while preserving the error and size guarantees.  This
  property means that the summary can be treated like other algebraic
  objects such as sum and max, 
  % essentially allows the summary to be used as if they were some
  %simple decomposable aggregates like sum and max, 
  which is especially useful for computing summaries on massive distributed
  data.  Many data summaries are trivially mergeable by construction, most
  notably those based on linear transformations.  But some other fundamental
  ones like those for heavy hitters and quantiles, are not (known to be)
  mergeable.  In this paper, we demonstrate that these summaries are indeed
  mergeable or can be made mergeable after appropriate modifications.
  Specifically, we show that for $\epsilon$-approximate heavy hitters, there is a
  deterministic mergeable summary of size $O(1/\epsilon)$; for $\epsilon$-approximate
  quantiles, there is a deterministic summary of size $O(\frac{1}{\epsilon}
  \log(\epsilon n))$ that has a restricted form of mergeability, and a randomized
  one of size $O(\frac{1}{\epsilon} \log^{3/2}\frac{1}{\epsilon})$ with full
  mergeability.  We also extend our results to geometric summaries such as
  $\epsilon$-approximations and $\epsilon$-kernels.}
  author = {Graham Cormode},
  booktitle = {{ACM} {SIGKDD} Conference},
  att_authors = {gc2602},
  att_private = {false},
  year = 2011,
  url = {../papers/attackdd.pdf},
  poster = {../slides/attackddposter.pdf},
  link = {},
  title = {Personal Privacy vs Population Privacy: Learning to Attack Anonymization},
  abstract = {
Over the last decade
 great strides have been made in developing techniques to
compute functions privately. 
In particular, Differential Privacy gives strong promises about
conclusions that can be drawn about an individual. 
In contrast, various syntactic methods for providing privacy (criteria
such as $k$-anonymity and $l$-diversity) have been criticized for
still allowing private information of an individual to be inferred.
In this paper, we consider the ability of an attacker to use data
meeting privacy definitions to build an accurate classifier. 
We demonstrate that even under Differential Privacy, such classifiers can be
used to infer ``private'' attributes accurately in realistic data. 
We compare this to similar approaches for inference-based attacks on
other forms of anonymized data.
We show how the efficacy of all these attacks can be measured on the
same scale, based on the probability of successfully inferring a
private attribute. 
We observe that the
accuracy of inference of private attributes for differentially private
data and $l$-diverse data can be quite similar. }
  author = {Edith Cohen and Graham Cormode and Nick Duffield},
  att_authors = {gc2602,nd1321,ec1458},
  att_private = {false},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  year = 2011,
  slides = {../slides/structure-vldb.pdf},
  url = {../papers/rangeaware.pdf},
  title = {Structure-Aware Sampling: Flexible and Accurate Summarization},
  abstract = {
  In processing large quantities of data, a fundamental problem is to
  obtain a summary which supports approximate query answering.  Random
  sampling yields flexible summaries which naturally support subset-sum
  queries with unbiased estimators and well-understood confidence bounds.
  Classic sample-based summaries, however, are designed for arbitrary
  subset queries and are oblivious to the structure in the set of keys.
  The particular structure, such as hierarchy, order, or product space
  (multi-dimensional), makes {\em range queries} much more relevant for
  most analysis of the data.
  Dedicated summarization algorithms for range-sum queries have also been
  extensively studied.  They can outperform existing sampling schemes in
  terms of accuracy on range queries per summary size. Their accuracy,
  however, rapidly degrades when, as is often the case, the query spans
  multiple ranges.  
  They are also less flexible---being targeted for range sum
  queries alone---and are often quite costly to build and use.
  In this paper we propose and evaluate variance optimal sampling
  schemes that are {\em structure-aware}.  
  These summaries improve over 
  the accuracy of existing {\em structure-oblivious} sampling schemes on
  range queries while retaining the benefits of sample-based summaries:
  flexible summaries, with high accuracy on 
both range queries and arbitrary subset queries.}
  author = {Graham Cormode and Ke Yi},
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {{ACM} Principles of Distributed Computing ({PODC})},
  year = 2011,
  link = {},
  url = {../papers/cdhhba.pdf},
  slides = {../slides/cdslidingwin-ba.pdf},
  title = {Tracking Distributed Aggregates over Time-based Sliding Windows (Brief Announcement)},
  abstract = {The area of distributed monitoring requires tracking the value of a
function of distributed data as new observations are made.
An important case is when attention is restricted to only a recent
time period, such as the last hour of readings---the sliding window model.
In this announcement, we outline a novel paradigm for handling such
monitoring problems, which we dub the ``forward/backward'' approach. 
This provides clean solutions for several fundamental problems, such
as counting, tracking frequent items, and maintaining order statistics.
We obtain efficient protocols for these problems that improve on
previous work, and are easy to implement.  
Specifically, we obtain optimal $O(\frac{k}{\epsilon} \log (\epsilon n/k))$
communication per window of $n$ updates for tracking counts and heavy
hitters with accuracy $\epsilon$ across $k$ sites; and near-optimal
communication of $O(\frac{k}{\epsilon} \log^2(1/\epsilon)$ $\log (n/k))$
for quantiles.  }
  author = {Edith Cohen and Graham Cormode and Nick Duffield},
  booktitle = {{ACM} Conference on Measurement and Modeling of Computer Systems ({SIGMETRICS})},
  year = 2011,
  link = {},
  url = {../papers/structure-sigmetrics.pdf},
  slides = {../slides/structure-sigmetrics.pdf},
  title = {Structure-Aware Sampling on Data Streams},
  abstract = {
  The massive data streams observed in network monitoring, data processing and
  scientific studies are typically too large to store.  
  For many applications over such data, we must obtain compact summaries
  of the stream. 
  These summaries should allow accurate answering of post hoc queries
  with estimates which approximate the true answers over the original stream.
  The data often has an underlying structure which makes certain subset
  queries, in particular {\em range queries}, more relevant than arbitrary
  subsets.  Applications such as access control, change
  detection, and heavy hitters typically involve subsets that are ranges
  or unions thereof.
  Random sampling is a natural summarization tool, 
  being easy to implement and flexible to use. 
  Known sampling methods are 
  good for arbitrary queries but fail to optimize for the common
  case of range queries.  
  Meanwhile, specialized summarization algorithms have been proposed 
  for range-sum queries and related problems. 
  These can outperform sampling giving fixed space resources, but lack
  its flexibility and simplicity. 
  Particularly, their accuracy degrades when queries span multiple ranges.
  We define new stream sampling algorithms
  with a smooth and tunable trade-off between
  accuracy on range-sum queries and arbitrary subset-sum queries.  
  The technical key is to relax requirements on the variance over all
  subsets to enable better performance on the ranges of interest.
  This boosts the accuracy on range queries while retaining the prime
  benefits of sampling, in particular flexibility and accuracy, with tail
  bounds guarantees.
  Our experimental study indicates that structure-aware summaries
  drastically improve range-sum accuracy with respect to
  state-of-the-art stream sampling algorithms and outperform
  deterministic methods on range-sum queries and hierarchical heavy
  hitter queries.}
  author = {Graham Cormode and Howard Karloff and Tony Wirth},
  att_authors = {gc2602,hk1971},
  att_private = {false},
  title = {Set Cover Algorithms For Very Large Datasets},
  booktitle = {{ACM} Conference on Information and Knowledge Management ({CIKM})},
  year = 2010,
  url = {../papers/ckw.pdf},
  slides = {../slides/setcover.pdf},
  abstract = {
  The problem of {\sc Set Cover}---to find the smallest subcollection of
  sets that covers some universe---is at the heart of many data and
  analysis tasks.  It arises in a wide range of settings, including
  operations research, machine learning, planning, data quality and data
  mining. Although finding an optimal solution is NP-hard, the greedy
  algorithm is widely used, and typically finds solutions that are close
  to optimal. 
  However, a direct implementation of the greedy approach, which
  picks the set with the largest number of uncovered items at each step,
  does not behave well when the input is very large and disk resident. The greedy
  algorithm must make many random accesses to disk, which are
  unpredictable and costly in comparison to linear scans. In order to
  scale {\sc Set Cover} to large datasets, we provide a new
  algorithm which finds a solution that is provably close to that of
  greedy, but which is much more efficient to implement using modern
  disk technology. Our experiments show a ten-fold improvement in
  speed on moderately-sized datasets, and an even greater improvement on
larger datasets. }
  author = {Graham Cormode and S. Muthukrishnan and Ke Yi and Qin Zhang},
  att_authors = {gc2602},
  att_private = {false},
  title = {Optimal sampling from distributed streams},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  year = 2010,
  url = {../papers/cdsample.pdf},
  slides = {../slides/cdsample.pdf},
  pubsite = {},
  abstract = {  A fundamental problem in data management is to draw a sample of a large
  data set, for approximate query answering, selectivity estimation, and
  query planning.  With large, streaming data sets, this problem becomes
  particularly difficult when the data is shared across multiple
  distributed sites.  The challenge is to ensure that a sample is drawn
  uniformly across the union of the data while minimizing the communication
  needed to run the protocol and track parameters of the evolving data.  At
  the same time, it is also necessary to make the protocol lightweight, by
  keeping the space and time costs low for each participant.
  In this paper, we present communication-efficient protocols for sampling
  (both with and without replacement) from $k$ distributed streams.  These
  apply to the case when we want a sample from the full streams, and to the
  sliding window cases of only the $W$ most recent items, or arrivals
  within the last $w$ time units.  We show that our protocols are optimal,
  not just in terms of the communication used, but also that they use
  minimal or near minimal (up to logarithmic factors) time to process each
  new item, and space to operate.}
  author = {Smriti Bhagat and Graham Cormode and Balachander Krishnamurthy and Divesh Srivastava},
  att_authors = {gc2602,ds8961,bk1836},
  att_private = {false},
  title = {Privacy in dynamic social networks},
  booktitle = {World Wide Web Conference ({WWW})},
  year = 2010,
  pubsite = {},
  url = {../papers/danonwww.pdf},
  abstract = {
  Anonymization of social networks before they are published or shared
  has become an important research question. Recent work on anonymizing
  social networks has looked at privacy preserving techniques for
  publishing a single instance of the network. However, social networks
  evolve and a single instance is inadequate for analyzing the evolution
  of the social network or for performing any longitudinal data
  analysis. We study the problem of repeatedly publishing social network
  data as the network evolves, while preserving privacy of users. 
  Publishing multiple instances of the same network independently
  has privacy risks, since stitching the information
  together may allow an adversary to identify users in the networks.
  We propose methods to anonymize a dynamic network such that the privacy of
  users is preserved when new nodes and edges are added to the published
  network. These methods make 
  use of link prediction algorithms to model the evolution of the
  social network. Using this predicted graph to perform group-based
  anonymization, the loss in privacy caused by new edges can be reduced.
  We evaluate the privacy loss on publishing multiple social network instances using our methods.}
  author = {Graham Cormode and Michael Mitzenmacher and Justin Thaler},
  att_authors = {gc2602},
  att_private = {false},
  title = {Streaming Graph Computations with a Helpful Advisor},
  booktitle = {European Symposium on Algorithms},
  year = 2010,
  url = {../papers/graphip.pdf},
  link = {},
  abstract = {Motivated by the trend to outsource work to commercial cloud computing
services, we consider a variation of the streaming paradigm where a
streaming algorithm can be assisted by a powerful helper that can
provide annotations to the data stream.  We extend previous work on
such {\em annotation models} by considering a number of graph
streaming problems.  Without annotations, streaming algorithms for 
graph problems generally require significant memory;  we show that
for many standard problems, including all graph problems that can 
be expressed with totally unimodular integer programming formulations,
only constant memory is
needed for single-pass algorithms given linear-sized annotations.
We also obtain a protocol achieving \textit{optimal} tradeoffs between
annotation length and memory usage for matrix-vector multiplication;
this result contributes to a trend of recent research on numerical
linear algebra in streaming models.}
  author = {Amit Chakrabarti and Graham Cormode and Ranga Kondapally and Andrew McGregor},
  att_authors = {gc2602},
  att_private = {false},
  title = {Information Cost Tradeoffs for Augmented Index and Streaming Language Recognition},
  booktitle = {{IEEE} Foundations of Computer Science ({FOCS})},
  year = 2010,
  url = {../papers/pqfocs.pdf},
  link = {},
  abstract = {
This paper makes three main contributions to the theory of communication
complexity and stream computation. First, we present new bounds on the
information complexity of {\sc Augmented Index}. In contrast to analogous results for {\sc Index}
by Jain, Radhakrishnan and Sen [\emph{J.~ACM}, 2009], we have to overcome the
significant technical challenge that protocols for {\sc Augmented Index} may violate the
``rectangle property'' due to the inherent input sharing. Second, we use these
bounds to resolve an open problem of Magniez, Mathieu and Nayak [\emph{STOC},
2010] that asked about the multi-pass complexity of recognizing Dyck
languages. This results in a natural separation between the standard
multi-pass model and the multi-pass model that permits reverse passes.  Third,
we present the first \emph{passive memory checkers} that verify the
interaction transcripts of priority queues, stacks, and double-ended queues.
We obtain tight upper and lower bounds for these problems, thereby addressing
an important sub-class of  the memory checking framework of Blum et al.
[\emph{Algorithmica}, 1994]. }
  author = {Graham Cormode and Ninghui Li and Tiancheng Li and Divesh Srivastava},
  att_authors = {gc2602,ds8961},
  att_private = {false},
  title = {Minimizing Minimality and Maximizing Utility: 
Analyzing Method-based attacks on Anonymized Data},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  year = 2010,
  abstract = {The principle of {\em anonymization} for data sharing has become a very popular paradigm for the
preservation of privacy of the data subjects. Since the introduction of $k$-anonymity, dozens of
methods and enhanced privacy definitions have been proposed. However, over-eager attempts to
minimize the information lost by the anonymization potentially allow private information to be
inferred. Proof-of-concept of this ``minimality attack'' has been demonstrated for a variety of
algorithms and definitions \cite{WFW+09}.\\
In this paper, we provide a comprehensive analysis and study of this attack, and demonstrate that
with care its effect can be almost entirely countered. The attack allows an adversary to increase
his (probabilistic) belief in certain facts about individuals over the data. We show that (a) a
large class of algorithms are not affected by this attack, (b) for a class of algorithms that have
a ``symmetric'' property, the attacker's belief increases by at most a small constant, and (c) even
for an algorithm chosen to be highly susceptible to the attack, the attacker's belief when using
the attack increases by at most a small constant factor. We also provide a series of experiments
that show in all these cases that the confidence about the sensitive value of any individual
remains low in practice, while the published data is still useful for its intended purpose. From
this, we conclude that the impact of such method-based attacks can be minimized.}
  author = {Smriti Bhagat and Graham Cormode and Balachander Krishnamurthy and Divesh Srivastava},
  att_authors = {gc2602,ds8961,bk1836},
  att_private = {false},
  title = {Prediction Promotes Privacy In Dynamic Social Networks},
  booktitle = {Workshop on Online Social Networks ({WOSN})},
  year = 2010,
  url = {../papers/danonwosn.pdf},
  abstract = {
Recent work on anonymizing online social networks (OSNs) 
has looked at privacy preserving techniques 
for publishing a single instance of the network. However, OSNs evolve and 
a single instance is inadequate for analyzing their evolution or performing 
longitudinal data analysis. We study the problem of repeatedly publishing 
OSN data as the network evolves while preserving privacy of users. 
Publishing multiple instances independently has privacy risks, since stitching 
the information together may allow an adversary to identify users. We provide 
methods to anonymize a dynamic network when new nodes and edges are added 
to the published network.
These methods 
use link prediction algorithms to model the evolution. Using this predicted 
graph to perform group-based anonymization, the loss in privacy caused by 
new edges can be eliminated almost entirely. 
We propose metrics for privacy loss, and evaluate them for publishing multiple OSN instances.}
  title = {Class-based graph anonymization for social network data},
  author = {Smriti Bhagat and Graham Cormode and Balachander Krishnamurthy and Divesh Srivastava},
  att_authors = {gc2602,ds8961,bk1836},
  att_private = {false},
  year = 2009,
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  url = {../papers/ganon.pdf},
  abstract = {
The recent rise in popularity of social networks, such as Facebook and
MySpace, has created large quantities of data about interactions
within these networks. Such data contains many private details about
individuals so anonymization is required prior to attempts to make
the data more widely available for scientific research. 
Prior work has considered simple graph data to be anonymized 
by removing all non-graph information and adding or deleting some edges. 
Since social network data is richer in details about the users and their
interactions, loss of details due to anonymization limits the
possibility for analysis.  
We present a new set of techniques for
anonymizing social network data based on grouping the entities into
classes, and masking the mapping between entities and the nodes that
represent them in the anonymized graph. Our techniques allow queries
over the rich data to be evaluated with high accuracy while
guaranteeing resilience to certain types of attack. 
To prevent inference of interactions, we rely on a critical 
``safety condition'' when forming these classes. 
We demonstrate utility via empirical data from social networking
settings. We give examples of complex
queries that may be posed and show that they can be answered over the
anonymized data efficiently and accurately.}
  title = {Probabilistic Histograms for Probabilistic Data},
  author = {Graham Cormode and Antonios Deligiannakis and Minos Garofalakis and Andrew McGregor},
  att_authors = {gc2602},
  att_private = {false},
  year = 2009,
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  abstract = {
There is a growing realization that 
modern database management systems (DBMSs) must be able 
to manage data that contains {\em uncertainties} that are represented in the
form of probabilistic relations. 
Consequently, the design of each 
core DBMS component must be revisited in
the presence of uncertain and probabilistic information.  
In this paper, we study how to build histogram synopses for probabilistic
relations, for the purposes of enabling both  DBMS-internal decisions (such as
indexing and query planning), and (possibly, user-facing) approximate query
processing tools. 
In contrast to initial work in this area, our {\em probabilistic histograms\/}
retain the key
{\em possible-worlds semantics\/} of probabilistic data, allowing for
more accurate, yet concise,  representation of the uncertainty 
characteristics of data and query results.
We present a variety of techniques for building optimal 
probabilistic histograms, each one tuned to a different choice 
of approximation-error metric. 
We show that these can be incorporated into a general Dynamic Programming (DP)
framework, which generalizes that used for existing histogram constructions. 
The end result is a histogram where each ``bucket'' is approximately represented by a
compact probability distribution function (PDF), which can 
be used as the basis for query planning and approximate query answering. 
We present novel, polynomial-time algorithms to find optimal probabilistic 
histograms for a variety of PDF-error metrics (including 
variation distance, sum squared error, max error, 
and variants of EMD).  
Our experimental study shows that our probabilistic histogram synopses can 
accurately capture the key statistical properties of uncertain data, while being
much more compact to store and work with than the original uncertain
relations. }
  title = {Annotations in Data Streams},
  author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor},
  booktitle = {International Colloquium on Automata, 
                  Languages and Programming ({ICALP})},
  year = 2009,
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/annotations.pdf},
  abstract = {
The central goal of data stream algorithms is to process massive streams
of data using {\em sublinear} storage space. 
Motivated by work in the database community on outsourcing database and data stream processing, we ask whether the space usage of such algorithms can be further
reduced by enlisting a more powerful ``helper'' who can {\em annotate}
the stream as it is read. We do not wish to blindly trust the helper, so we
require that the algorithm be convinced of having computed a correct
answer. We show
upper bounds that achieve a non-trivial tradeoff between the
amount of annotation used and the space required to verify it.  We also
prove lower bounds on such tradeoffs, often nearly matching the upper
bounds, via notions related to Merlin-Arthur communication complexity.
Our results cover the classic data stream problems of selection,
frequency moments, and fundamental graph problems such as
triangle-freeness and connectivity. 
  Our work is also part of a
growing trend --- including recent studies of multi-pass streaming,
read/write streams and randomly ordered streams --- of asking more
complexity-theoretic questions about data stream processing. 
It is a recognition that, in addition to practical relevance, the
data stream model
raises many interesting theoretical questions in its own right. }
  title = {Estimating the Confidence of Conditional Functional Dependencies},
  author = {Graham Cormode and Lukasz Golab and Flip Korn and Andrew McGregor
   			and Divesh Srivastava and Xi Zhang},
  att_authors = {gc2602,ds8961,pk1785,lg1173},
  att_private = {false},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  year = 2009,
  url = {../papers/cfdest.pdf},
  abstract = {
  Conditional functional dependencies (CFDs) have recently been proposed as
  extensions of classical functional dependencies that apply to a certain
  subset of the relation, as specified by a \emph{pattern tableau}.  
  Calculating the support and confidence of a CFD (i.e., the size of the 
  applicable subset and the extent to which it satisfies the CFD)
  gives valuable information about data semantics and data quality.  
  While computing the support is easier, computing the confidence 
  exactly is expensive if the relation is large, and estimating it from a 
  random sample of the relation is unreliable unless the sample is large.  
  We study how to efficiently estimate the confidence of a CFD 
  with a small number of passes (one or two) over the input using small space. 
  Our solutions are based on a variety of sampling and sketching techniques,
  and apply when the pattern tableau is known in advance, and also the harder 
  case when this is given after the data have been seen. 
  We analyze our algorithms, and show that they can guarantee 
  a small additive error; we also show that relative errors guarantees are not possible. 
  We demonstrate the power of these methods empirically, with a detailed study over
  a mixture of real and synthetic data. 
  These experiments show that it is possible to estimate the CFD confidence
  very accurately with summaries which are much smaller than the size of the data they represent.}
  title = {Space-optimal Heavy Hitters with Strong Error Bounds},
  author = {Radu Berinde and Graham Cormode and Piotr Indyk and Martin Strauss},
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  year = 2009,
  url = {../papers/counters.pdf},
  slides = {../slides/counters.pdf},
  abstract = {
  The problem of finding heavy-hitters and approximating the frequencies
  of items is at the heart of many problems in data stream analysis. It
  has been observed that several proposed solutions to this problem can
  outperform their worst-case guarantees on real data. This leads to the
  question of whether some stronger bounds can be guaranteed. We answer
  this in the positive by showing that a class of ``counter-based
  algorithms'' (including the popular and very space-efficient Frequent
  and Spacesaving algorithms) provide much stronger approximation
  guarantees than previously known. Specifically, we show that errors in
  the approximation of individual elements do not depend on the
  frequencies of the most frequent elements, but only on the frequency
  of the remaining ``tail''.
  This shows that counter-based methods are the
  most space-efficient (in fact, space-optimal) algorithms having this
  strong error bound. 
  This tail guarantee allows these algorithms to solve the ``sparse
  recovery'' problem. Here, the goal is to recover a faithful
  representation of the vector of frequencies, $f$. We prove that using
  space $O(k)$, the algorithms construct an approximation $f^*$ to the
  frequency vector $f$ so that the L1 error $||f-f^*||_1$ is close to the best
  possible error $\min_{f'} ||f' - f||_1$, where $f'$ ranges over all vectors
  with at most $k$ non-zero entries. This improves the previously best
  known space bound of about $O(k \log n)$ for streams without element
  deletions. Other consequences of the tail guarantees are results for skewed
  (Zipfian) data, and guarantees for accuracy of merging multiple
summarized streams.  }
  title = {Time-Decayed Correlated Aggregates over Data Streams},
  author = {Graham Cormode and Srikanta Tirthapura and Bojian Xu},
  booktitle = {{SIAM} Conference on Data Mining ({SDM})},
  att_authors = {gc2602},
  att_private = {false},
  year = 2009,
  url = {../papers/corag.pdf},
  abstract = {
Data stream analysis frequently relies on identifying correlations
and posing conditional queries on the data after it has been seen.
Correlated aggregates form an important example of such queries,
which ask for an aggregation over one dimension of stream elements
which satisfy a predicate on another dimension. Since recent
events are typically more important than older ones, time decay
should also be applied to downweight less significant values. We
present space-efficient algorithms as well as space lower bounds
for the time-decayed correlated sum, a problem at the heart of many
related aggregations. By considering different fundamental classes
of decay functions, we separate cases where efficient relative error
or additive error is possible, from other cases where linear space is
necessary to approximate. In particular, we show that no efficient
algorithms are possible for the popular sliding window and exponential
decay models, resolving an open problem. The results are
surprising, since efficient approximations are known for other data
stream problems under these decay models. This is a step towards
better understanding which sophisticated queries can be answered
on massive streams using limited memory and computation.}
  title = {Histograms and Wavelets on Probabilistic Data},
  author = {Graham Cormode and Minos Garofalakis},
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2009,
  slides = {../slides/phistslides.pdf},
  url = {../papers/phist.pdf},
  note = {Best paper award},
  abstract = {
There is a growing realization that uncertain information
is a first-class citizen in modern database management.
As such, we need techniques to correctly and efficiently
process uncertain data in database systems. In particular, data
reduction techniques that can produce concise, accurate synopses
of large probabilistic relations are crucial. Similar to their
deterministic relation counterparts, such compact probabilistic
data synopses can form the foundation for human understanding
and interactive data exploration, probabilistic query planning
and optimization, and fast approximate query processing in
probabilistic database systems.
In this paper, we introduce definitions and algorithms for
building histogram- and Haar wavelet-based synopses on probabilistic
data. The core problem is to choose a set of histogram
bucket boundaries or wavelet coefficients to optimize the accuracy
of the approximate representation of a collection of probabilistic
tuples under a given error metric. For a variety of different
error metrics, we devise efficient algorithms that construct
optimal or near optimal size B histogram and wavelet synopses.
This requires careful analysis of the structure of the probability
distributions, and novel extensions of known dynamic-programming-
based techniques for the deterministic domain.
Our experiments show that this approach clearly outperforms
simple ideas, such as building summaries for samples drawn 
from the data distribution, while taking equal or less time.}
  title = {Semantics of Ranking Queries for Probabilistic Data and Expected Ranks},
  author = {Graham Cormode and Feifei Li and Ke Yi},
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2009,
  url = {../papers/exprank.pdf},
  abstract = {
When dealing with massive quantities of data, top-k
queries are a powerful technique for returning only the k
most relevant tuples for inspection, based on a scoring function.
The problem of efficiently answering such ranking queries has
been studied and analyzed extensively within traditional database
settings. The importance of the top-k is perhaps even greater in
probabilistic databases, where a relation can encode exponentially
many possible worlds. There have been several recent attempts
to propose definitions and algorithms for ranking queries over
probabilistic data. However, these all lack many of the intuitive
properties of a top-k over deterministic data. Specifically, we
define a number of fundamental properties, including exact-k,
containment, unique-rank, value-invariance, and stability, which
are all satisfied by ranking queries on certain data. We argue
that all these conditions should also be fulfilled by any reasonable
definition for ranking uncertain data. Unfortunately, none of the
existing definitions is able to achieve this.
To remedy this shortcoming, this work proposes an intuitive
new approach of expected rank. This uses the well-founded notion
of the expected rank of each tuple across all possible worlds
as the basis of the ranking. We are able to prove that, in
contrast to all existing approaches, the expected rank satisfies
all the required properties for a ranking query. We provide
efficient solutions to compute this ranking across the major
models of uncertain data, such as attribute-level and tuple-level
uncertainty. For an uncertain relation of N tuples, the processing
cost is $O(N \log N)$ --- no worse than simply sorting the relation.
In settings where there is a high cost for generating each tuple
in turn, we show pruning techniques based on probabilistic tail
bounds that can terminate the search early and guarantee that
the top-k has been found. Finally, a comprehensive experimental
study confirms the effectiveness of our approach.}
  author = {Graham Cormode and Vladislav Shkapenyuk and Divesh Srivastava and Bojian Xu},
  title = {Forward Decay: A Practical Time Decay Model for Streaming Systems},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2009,
  att_authors = {gc2602,ds8961,vs9593},
  att_private = {false},
  url = {../papers/fwddecay.pdf},
  abstract = {
Temporal data analysis in data warehouses and data
streaming systems often uses time decay to reduce the importance
of older tuples, without eliminating their influence, on the results
of the analysis. Exponential time decay is commonly used in
practice, even though other decay functions (e.g., polynomial
decay) have also been identified as useful. We argue that this
is because the usual definitions of time decay are backwards:
the decayed weight of a tuple is based on its age, measured
backward from the current time. Since this age is constantly
changing, such decay is too complex and unwieldy for scalable
implementation. In this paper, we propose a new class of forward decay
functions based on measuring forward from a fixed point in
time. We show that this model captures the more practical
models already known, such as exponential decay and landmark
windows, but also includes a wide class of other types of time
decay. We provide efficient algorithms to compute a variety of
aggregates and draw samples under forward decay, and show
that these are easy to implement scalably. Further, we provide
empirical evidence that these can be executed in a production
data stream management system with little or no overhead
compared to the undecayed computations. Our implementation
required no extensions to the query language or the DSMS,
demonstrating that forward decay represents a practical model
of time decay for systems that deal with time-based data.}
  author = {Graham Cormode and Marios Hadjieleftheriou},
  title = {Finding Frequent Items in Data Streams},
  att_authors = {gc2602,mh6516},
  att_private = {false},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  year = 2008,
  url = {../papers/freq.pdf},
  note = {Best paper award},
  pubsite = {},
  abstract = {
The frequent items problem is to process a stream of items and find
all items occurring more than a given fraction of the time. It is one
of the most heavily studied problems in data stream mining, dating
back to the 1980s. Many applications rely directly or indirectly
on finding the frequent items, and implementations are in use in
large scale industrial systems. However, there has not been much
comparison of the different methods under uniform experimental
conditions. It is common to find papers touching on this topic in
which important related work is mischaracterized, overlooked, or
simply ignored. In this paper, we aim to present the most important algorithms
for this problem in a common framework. We have created baseline
implementations of the algorithms, and used these to perform a
thorough experimental study of their properties. We give empirical
evidence that there is considerable variation in the performance of
frequent items algorithms. The best methods can be implemented
to find frequent items with high accuracy using only tens of kilobytes
of memory, at rates of millions of items per second on cheap
modern hardware.}
  author = {Graham Cormode and Divesh Srivastava and Ting Yu and Qing Zhang},
  title = {Anonymizing Bipartite Graph Data using Safe Groupings},
  att_authors = {gc2602,ds8961},
  att_private = {false},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  year = 2008,
  url = {../papers/gpriv.pdf},
  abstract = {Private data often comes in the form of associations between entities, such
as customers and products bought from a pharmacy, which are naturally
represented in the form of a large, sparse bipartite graph. As with tabular
data, it is desirable to be able to publish anonymized versions of such data,
to allow others to perform ad hoc analysis of aggregate graph properties.
However, existing tabular anonymization techniques do not give useful or
meaningful results when applied to graphs: small changes or masking of
the edge structure can radically change aggregate graph properties.
We introduce a new family of anonymizations, for bipartite graph data,
called (k, l)-groupings. These groupings preserve the underlying graph
structure perfectly, and instead anonymize the mapping from entities to
nodes of the graph. We identify a class of safe (k, l)-groupings that have
provable guarantees to resist a variety of attacks, and show how to find such
safe groupings. We perform experiments on real bipartite graph data to
study the utility of the anonymized version, and the impact of publishing
alternate groupings of the same graph data. Our experiments demonstrate
that (k, l)-groupings offer strong tradeoffs between privacy and utility.}
  author = {Graham Cormode and Flip Korn and Srikanta Tirthapura},
  title = {Time-Decaying Aggregates in Out-of-order Streams},
  att_authors = {gc2602,pk1785},
  att_private = {false},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  year = 2008,
  url = {../papers/decaypods.pdf},
  slides = {../slides/decay-pods.pdf},
  abstract = {
  Processing large data streams is now a major topic in data management.
  The data involved can be truly massive, and the required
  analyses complex. In a stream of sequential events such as stock
  feeds, sensor readings, or IP traffic measurements, data tuples pertaining
  to recent events are typically more important than older
  ones. This can be formalized via time-decay functions, which assign
  weights to data based on the age of data. Decay functions
  such as sliding windows and exponential decay have been studied
  under the assumption of well-ordered arrivals, i.e., data arrives in
  non-decreasing order of time stamps. However, data quality issues
  are prevalent in massive streams (due to network asynchrony and
  delays etc.), and correct arrival order is not guaranteed.

  We focus on the computation of decayed aggregates such as
  range queries, quantiles, and heavy hitters on out-of-order streams,
  where elements do not necessarily arrive in increasing order of
  timestamps. Existing techniques such as Exponential Histograms
  and Waves are unable to handle out-of-order streams. We give the
  first deterministic algorithms for approximating these aggregates
  under popular decay functions such as sliding window and polynomial
  decay. We study the overhead of allowing out-of-order arrivals
  when compared to well-ordered arrivals, both analytically and experimentally.
  Our experiments confirm that these algorithms can be
  applied in practice, and compare the relative performance of different
approaches for handling out-of-order arrivals.}
  author = {Graham Cormode and Andrew McGregor},
  title = {Approximation Algorithms for Clustering Uncertain Data},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  att_authors = {gc2602},
  att_private = {false},
  year = 2008,
  url = {../papers/pclust.pdf},
  slides = {../slides/pclust-pods.pdf},
  abstract = {
  There is an increasing quantity of data with uncertainty arising from
  applications such as sensor network measurements, record linkage, and
  as output of mining algorithms. 
  This uncertainty is typically formalized as probability density
  functions over tuple values. 
  Beyond storing and processing such data in a DBMS, it is necessary to
  perform other data analysis tasks such as data mining. 
  We study the core mining problem of {\em clustering} on uncertain
  data, and define appropriate natural generalizations of standard
  clustering optimization criteria. 
  Two variations arise, depending on whether a point is automatically
  associated with its optimal center, or whether it must be assigned to
  a fixed cluster no matter where it is actually located. 
  For uncertain versions of $k$-means and $k$-median, we show reductions
  to their corresponding weighted versions on data with no
  uncertainties. These are simple in the unassigned case, but require some care for the
  assigned version. 
  Our most interesting results are for uncertain $k$-center, 
  which generalizes both traditional $k$-center and $k$-median objectives.
  We show a variety of bicriteria approximation algorithms. 
  One picks ${poly}(\log n)$ more centers and achieves a
  $(1+\epsilon)$ approximation to the best uncertain $k$-centers. 
  Another picks $2k$ centers and achieves a constant factor
  approximation. Collectively, these results are the first known guaranteed
  approximation algorithms for the problems of clustering uncertain
data.  }
  author = {Graham Cormode and Flip Korn and S. Muthukrishnan and Divesh Srivastava},
  att_authors = {gc2602,ds8961,pk1785},
  att_private = {false},
  booktitle = {Scientific and Statistical Database Management ({SSDBM})},
  title = {Summarizing Two-Dimensional Data
                  with Skyline-based Statistical Descriptors},
  year = 2008,
  url = {../papers/2dquant.pdf},
  abstract = {
Much real data consists of more than one dimension,
such as financial transactions (eg, price $\times$ volume)
and IP network flows (eg, duration $\times$ numBytes),
and capture relationships between the variables.
For a single dimension, quantiles are intuitive and robust descriptors.
Processing and analyzing such data, particularly in data warehouse or data
streaming settings, requires similarly robust and informative
statistical descriptors that go beyond one-dimension.
Applying quantile methods to summarize a multidimensional
distribution along only singleton attributes
ignores the rich dependence amongst the variables.

In this paper, we present new skyline-based statistical
descriptors for capturing the distributions over pairs of dimensions.
They generalize the notion of quantiles in the individual
dimensions, and also incorporate properties of the joint distribution.
We introduce {\em $\phi$-quantours} and {\em $\alpha$-radials}, 
which are skyline points over subsets of the data, and propose
{\em ($\phi, \alpha$)-quantiles}, found from the union of these skylines,
as statistical descriptors of two-dimensional distributions.
We present efficient online algorithms for tracking
($\phi,\alpha$)-quantiles on two-dimensional streams
using guaranteed small space.
We identify the principal properties of the proposed descriptors
and perform extensive experiments with synthetic and real IP traffic data
to study the efficiency of our
proposed algorithms.}
  author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor},
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {{ACM} Symposium on Theory of Computing ({STOC})},
  title = {Robust Lower Bounds for Communication and Stream Computation},
  year = 2008,
  url = {../papers/robustbounds.pdf},
  slides = {../slides/rcc-stoc.pdf},
  abstract = {
In this paper we study the communication complexity of evaluating
functions when the input data is distributed (according to some known
distribution) to two or more players. This naturally extends
previous models such as the best-case and worst-case partition models
to a model that captures random
partitions and distributions with redundancy  where bits of data may
be known to more than one player. 
This allows us to distinguish functions which require significant
communication for almost every allocation of input from those for
which only a few cases are hard. 

There is also a strong connection to proving lower-bounds in the data
stream model that are ``robust'' to the ordering of the data. 
That is, we prove lower-bounds for when
  order of the data-stream is not chosen adversarially
  but rather is determined uniformly (or near-uniformly) 
  from the set of all permutations. 
This streaming model has attracted recent interest, since lower bounds
  here give stronger evidence for the inherent hardness of streaming problems.

Our results include the first average-partition communication
lower-bounds for problems including multi-party set-disjointness and
gap-hamming. Both are tight. We also extend and improve
previous results for a form of pointer-jumping
 that is relevant to the problem of selection. Collectively, 
these results yield lower-bounds for a
variety of problems in the random-order, data-stream model including
estimating the number of distinct elements, approximating frequency
moments, and quantile estimation.}
  author = {Graham Cormode and Flip Korn and S. Muthukrishnan and 
Yihua Wu},
  att_authors = {gc2602,pk1785},
  att_private = {false},
  title = {On Signatures for Communication Graphs},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  url = {../papers/graphsig.pdf},
  year = 2008,
  abstract = {
Communications between individuals can be represented by (weighted, directed) graphs.
Many applications operate on communication graphs
associated with telephone calls, emails, Instant Messages (IM), 
blogs, web forums, e-business relationships and so on. 
These applications include identifying repetitive
fraudsters, message board aliases, multihoming of IP addresses, etc. 
Tracking such electronic
identities on communication network can be achieved if we have a
reliable ``signature'' for nodes and activities. While many examples
of ad hoc signatures can be proposed for particular tasks, what is needed
is a systematic study of the principles behind the usage of signatures
in any task. 

We develop a formal framework for the use of signatures
on communication graphs and identify three fundamental properties
that are natural to signature schemes: persistence, uniqueness and
robustness.  We justify these properties by showing how they impact
a set of applications.  We then explore several signature
schemes --- previously defined or new --- in our framework 
and evaluate them on real data in terms
of these properties.  This provides insights into suitable
signature schemes for desired applications.
Finally, as case studies, we focus on the concrete application 
of enterprise network traffic.}
  author = {Graham Cormode and Flip Korn and Srikanta Tirthapura},
  att_authors = {gc2602,pk1785},
  att_private = {false},
  title = {Exponentially Decayed Aggregates on Data Streams},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = 2008,
  slides = {../slides/expdecay.pdf},
  url = {../papers/expdecay.pdf},
  abstract = {In a massive stream of sequential events such as
stock feeds, sensor readings, or IP traffic measurements, tuples
pertaining to recent events are typically more important than
older ones. It is important to compute various aggregates over
such streams after applying a decay function which assigns
weights to tuples based on their age. We focus on the computation
of exponentially decayed aggregates in the form of quantiles and
heavy hitters. Our techniques are based on extending existing
data stream summaries, such as the q-digest and the ``spacesaving''
algorithm. Our experiments confirm that our methods
can be applied in practice, and have similar space and time costs
to the non-decayed aggregate computation.}
  author = {G. Cormode and S. Muthukrishnan and K. Yi},
  att_authors = {gc2602},
  att_private = {false},
  title = {Algorithms for Distributed, Functional Monitoring},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  year = 2008,
  url = {../papers/soda08.pdf},
  abstract = {
We study what we call {\em functional monitoring} problems. 
We have 
$k$ players each tracking their inputs, say $A_j(t)$ up until time $t$, 
and communicating with a central coordinator. 
The coordinator's task is to
compute a given function $f$ over the union of the inputs
$\cup_{i} A_j(t)$, {\em continuously} at all times $t$. 
The goal is to minimize the number of bits communicated between the 
players and the coordinator.
A simple example is when $f$ is simply the sum, and the coordinator
is required to alert when the sum of a distributed 
set of values exceeds a given threshold. 
Of interest is the approximate version where the coordinator outputs
$1$ if $f \geq \tau$ and $0$ if $f\leq \tau-\epsilon$. 
This defines
the $(k,f,\tau,\epsilon)$ distributed, functional monitoring problem.

Functional monitoring problems are fundamental in distributed systems, 
in particular sensor networks, where we must minimize communication; they also
connect to problems in communication complexity, communication theory, 
and signal processing. Yet few formal bounds are known for 
functional monitoring. 

We give upper and lower bounds for 
the $(k,f,\tau,\epsilon)$ problem for foundational $f$'s. 
In particular, we study frequency moments ($F_0, F_1, F_2$). 
We show that for the simplest cases, the cost of monitoring a
function can be the same as the cost of its one-time computation. 
However, for $F_2$ we show a carefully constructed multi-round
algorithm which uses ``sketch summaries'' at multiple levels of detail
and solves the $(k,F_2,\tau,\epsilon)$ problem  
with communication $\tilde{O}(k^2/\epsilon + k^{3/2}/\epsilon^3)$.
Since frequency moment estimation is central to other problems,
our results have immediate applications to histograms, wavelet computations,
and others. Our algorithmic techniques are likely to be useful for other 
functional monitoring problems.}
  author = {S. Bhagat and G. Cormode and I. Rozenbaum},
  att_authors = {gc2602},
  att_private = {false},
  title = {Applying Link-based Classification to Label Blogs},
  booktitle = {Joint WEBKDD and SNA-KDD Workshop},
  year = {2007},
  url = {../papers/sigkdd07.pdf},
  slides = {../slides/SNAKDD-blogs.pdf},
  abstract = {
In analyzing data from social and communication networks, we encounter
the problem of classifying objects where there is an explicit
link structure amongst the objects. We study the problem of inferring
the classification of all the objects from a labeled subset, using
only the link-based information amongst the objects.
We abstract the above as a labeling problem on multigraphs with
weighted edges. We present two classes of algorithms, based on
local and global similarities. Then we focus on multigraphs induced
by blog data, and carefully apply our general algorithms to
specifically infer labels such as age, gender and location associated
with the blog based only on the link-structure amongst them. We
perform a comprehensive set of experiments with real, large-scale
blog data sets and show that significant accuracy is possible from
little or no non-link information, and our methods scale to millions
of nodes and edges.}
  author = {S. Ganguly and G. Cormode},
  att_authors = {gc2602},
  att_private = {false},
  title = {On Estimating Frequency Moments of Data Streams},
  booktitle = {Proceedings of {RANDOM}},
  year = {2007},
  url = {../papers/gc-l1.pdf},
  abstract = {Space-economical  estimation of the $p$th frequency moments, defined as
$F_p = \sum_{i=1}^n |{f_i}|^p$, for $p > 0$, is of interest in
estimating all-pairs distances in a large data matrix,
machine learning, and in data stream computation. 
Random sketches formed by the inner product  of the
frequency vector $f_1, \ldots, f_n$ with a suitably chosen random
vector were pioneered by Alon, Matias and Szegedy, and 
have since played a
central role in estimating $F_p$ and for data stream computations in general.
The concept of $p$-stable sketches formed by the inner product of the 
vector with a random vector whose components are drawn from a
$p$-stable distribution, was proposed by Indyk
for estimating $F_p$, for $0 < p < 2$, and has been further studied
by Li.

In this paper, we consider the problem of estimating $F_p$, for $0 <
p < 2$.  A disadvantage of the stable sketches technique and its
variants is that they require $O(\frac{1}{\epsilon^2})$ 
inner-products of the frequency
vector with dense vectors of stable 
(or nearly stable) random variables to be maintained.
This means that each stream update can be quite time-consuming. 
We present algorithms for estimating $F_p$, for $0 < p < 2$, 
that do not require the use of stable sketches or their variants.
Our  technique is elementary in nature, in that, it
uses simple randomization in conjunction with well-known summary
structures for data streams, such as the CM sketch
and the Count sketch structure.
Our algorithms require space
$\tilde{O}(\frac{1}{\epsilon^{2+p}})$ to estimate $F_p$ to within $1 \pm
\epsilon$ factors and require expected time $O(\log F_1 \log
\frac{1}{\delta})$ to process each update. Thus, our technique
trades an $O(\frac{1}{\epsilon^p})$ factor in space for
much more efficient processing of stream updates. 
We also present a stand-alone iterative estimator for $F_1$.}
  author = {G. Cormode and S. Tirthapura and B. Xu},
  att_authors = {gc2602},
  att_private = {false},
  title = {Time-Decaying Sketches for Sensor Data Aggregation},
  booktitle = {{ACM} Principles of Distributed Computing ({PODC})},
  year = {2007},
  url = {../papers/timedecaypodc.pdf},
  abstract = {We present a new sketch for summarizing network data. 
The sketch has the following properties which make it useful in
communication-efficient aggregation in distributed streaming
scenarios, such as sensor networks: the sketch is duplicate-insensitive,
i.e. re-insertions of the same data will not affect
the sketch, and hence the estimates of aggregates. Unlike
previous duplicate-insensitive sketches for sensor data aggregation, 
it is also time-decaying, so that the weight
of a data item in the sketch can decrease with time according
to a user-specified decay function. The sketch can give
provably approximate guarantees for various aggregates of
data, including the sum, median, quantiles, and frequent elements.
The size of the sketch and the time taken to update
it are both polylogarithmic in the size of the relevant data.
Further, multiple sketches computed over distributed data
can be combined with the same guarantees. To our knowledge,
this is the first sketch that combines all the above properties.}
  att_authors = {gc2602},
  att_private = {false},
  author = {G. Cormode and M. Garofalakis},
  title = {Sketching Probabilistic Data Streams},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  year = {2007},
  url = {../papers/probstreams.pdf},
  slides = {../slides/probstreams.pdf},
  abstract = {The management of uncertain, probabilistic data has 
recently emerged
as a useful paradigm for dealing with the inherent unreliabilities
of several real-world application domains, including data cleaning,
information integration, and pervasive, multi-sensor computing.
Unlike conventional data sets, a set of probabilistic tuples defines
a probability distribution over an exponential number of possible
worlds (i.e., grounded, deterministic databases). This possible
worlds interpretation allows for clean query semantics but
also raises hard computational problems for probabilistic database
query processors. To further complicate matters, in many scenarios
(e.g., large-scale process and environmental monitoring using multiple
sensor modalities), probabilistic data tuples arrive and need to
be processed in a streaming fashion; that is, using limited memory
and CPU resources and without the benefit of multiple passes over a
static probabilistic database. Such probabilistic data streams raise a
host of new research challenges for stream-processing engines that,
to date, remain largely unaddressed.

In this paper, we propose the first space- and time-efficient algorithms
for approximating complex aggregate queries (including,
the number of distinct values and join/self-join sizes) over probabilistic
data streams. Following the possible-worlds semantics,
such aggregates essentially define probability distributions over the
space of possible aggregation results, and our goal is to characterize
such distributions through efficient approximations of their key
moments (such as expectation and variance). Our algorithms offer
strong randomized estimation guarantees while using only sublinear
space in the size of the stream(s), and rely on novel, concise
streaming sketch synopses that extend conventional sketching ideas
to the probabilistic streams setting. Our experimental results verify
the effectiveness of our approach. }
  att_authors = {gc2602},
  att_private = {false},
  author = {S. Bhagat and G. Cormode and S. Muthukrishnan and 
                     I. Rozenbaum and H. Xue},
  title = {No Blog is an Island: Analyzing Connections Across 
                     Information Networks},
  year = {2007},
  url = {../papers/icwsm07.pdf},
  booktitle = {International Conference on Weblogs and Social Media},
  abstract = {
There are multiple information and social networks among individuals,
from telephone to email, web, blog, instant message (IM) and
chat networks. Prior work has studied each of these individual networks
quite extensively, including telephone networks, postal
network, web communities, and so on. Still, what is of
great interest is how these networks collide and interact with each
other. Each individual has presence in multiple networks and it will
be of interest to understand how the references and presence in one
affects that in the others. Previously such studies would have been
difficult to do since each source of data was often owned by a specific
entity. Blogs now provide a unique, public source of data that
naturally provides visibility into multiple networks. For example,
blog entries cite web pages, blogs have friends and community, as
well as blog profiles that have links to email, IM and other networks.
In this paper, we use the blogs as a starting point to pull in data about
these multiple networks and study how these multiple networks interact
with each other. While much of the connections are within
specific networks, there is still a wealth of information, structure and
form to the connections across these networks as our analyses show.}
  att_authors = {gc2602},
  att_private = {false},
  author = {G. Cormode and S. Muthukrishnan and W. Zhuang},
  title = {Conquering the Divide: 
                   Continuous Clustering of Distributed Data Streams},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  year = {2007},
  url = {../papers/cdclust.pdf},
  abstract = {Data is often collected over a distributed network, but
in many cases, is so voluminous that it is impractical and
undesirable to collect it in a central location. Instead, we
must perform distributed computations over the data, guaranteeing
high quality answers even as new data arrives. In
this paper, we formalize and study the problem of maintaining
a clustering of such distributed data that is continuously
evolving. In particular, our goal is to minimize the communication
and computational cost, still providing guaranteed
accuracy of the clustering.

We focus on the k-center clustering, and provide a suite
of algorithms that vary based on which centralized algorithm
they derive from, and whether they maintain a single
global clustering or many local clusterings that can be
merged together. We show that these algorithms can be designed
to give accuracy guarantees that are close to the best
possible even in the centralized case. In our experiments,
we see clear trends among these algorithms, showing that
the choice of algorithm is crucial, and that we can achieve a
clustering that is as good as the best centralized clustering,
with only a small fraction of the communication required to
collect all the data in a single location.}
  author = {A. Chakrabarti and G. Cormode and A. McGregor},
  att_authors = {gc2602},
  att_private = {false},
  title = {A Near-Optimal Algorithm for Computing the Entropy of 
                     a Stream},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  year = {2007},
  url = {../papers/entropy-soda06.pdf},
  abstract = {
  We describe a simple algorithm for computing the empirical entropy of
  a stream of $m$ values in a single pass, using $O(\epsilon^{-2} \log
  (\delta^{-1}) \log^2 m)$ words of space.  Our algorithm is based upon
  a novel extension of a method introduced by Alon, Matias,  and Szegedy. 
  We show a space lower bound of $\Omega({\epsilon^{-2}/\log
  (1/\epsilon)})$, meaning that our algorithm is near optimal in terms of
  its dependency on $\epsilon$.  This improves over previous work on this
  problem.  We show that generalizing to
  $k$th order entropy requires close to linear space, and give additive
  approximations using our algorithm.}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {Combinatorial Algorithms for Compressed Sensing},
  booktitle = {SIROCCO},
  year = {2006},
  url = {../papers/cs-sirocco.pdf},
  slides = {../slides/cs-siroccoslides.pdf},
  pubsite = {},
  note = {},
  abstract = {
In sparse approximation theory, the fundamental problem is to reconstruct a 
signal A in R^n from linear measurements  with respect to a 
dictionary of psi_i's. Recently, there is focus on the novel direction 
Compressed Sensing [Donoho:04] where the reconstruction can be done with 
very few---O(k log n)---linear measurements over a modified dictionary, if 
the signal is compressible, that is, its information is concentrated in 
coefficients with the original dictionary. In particular, these results 
prove that there exists a single O(k log n) x n measurement matrix such 
that any such signal can be reconstructed from these measurements, with 
error at most O(1) times the worst case error for the class of such 
signals.  Compressed sensing has generated tremendous excitement both 
because of the sophisticated underlying Mathematics and because of its 
potential applications.

In this paper, we address outstanding open problems in Compressed Sensing. 
Our main result is an explicit construction of a non-adaptive measurement 
matrix and the corresponding reconstruction algorithm
so that with a number of measurements polynomial in k, log n, 1/$epsilon$,
we can reconstruct compressible signals. This is the first known 
polynomial time explicit construction of any such measurement matrix. In 
addition,  our result improves the error guarantee from O(1) to 1 + 
$\epsilon$ and improves the reconstruction time from poly(n) to poly(k log n).

Our second result is a randomized construction of O(k polylog{n}) 
measurements that work for each signal with high probability and gives 
per-instance approximation guarantees rather than over the class of all 
signals. Previous work on Compressed Sensing does not provide such 
per-instance approximation guarantees; our result improves the best 
number of measurements known from prior work in other areas including 
Learning Theory, Streaming algorithms and Complexity Theory for this problem.

Our approach is combinatorial. In particular, we use two parallel sets 
group tests, one to filter and the other to certify and estimate; the 
resulting algorithms are quite simple to implement.}
  author = {G. Cormode and F. Korn and S. Muthukrishnan and D. Srivastava},
  att_authors = {gc2602,pk1785,ds8961},
  att_private = {false},
  title = {Space- and Time-Efficient Deterministic Algorithms for Biased Quantiles over Data Streams},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  year = {2006},
  note = {},
  url = {../papers/bq-pods.pdf},
  slides = {../slides/bquant-short.pdf},
  abstract = {
Skew is prevalent in data streams, and should be taken into account
by algorithms that analyze the data.
The problem of finding ``biased quantiles''--- that is, approximate
quantiles which must be more accurate for more extreme values ---
is a framework for summarizing such skewed data on data streams.
We present the first deterministic algorithms for answering biased
quantiles queries accurately with small---sublinear in the input size---
space and time bounds in one pass.
The space bound is near-optimal, and the amortized update cost is
close to constant,
making it practical for handling high speed network data streams.
We demonstrate that not only can we prove theoretical properties of the
algorithm, but that it also uses less space than existing methods in many
practical settings, and is fast to maintain.}
  author = {G. Cormode and R. Keralapura and J. Ramimirtham},
  att_authors = {gc2602},
  att_private = {false},
  title = {Communication-Efficient Distributed Monitoring of Thresholded Counts},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  url = {../papers/cdthresh.pdf},
  slides = {../slides/cdthresh.ppt},
  note = {},
  year = {2006},
  abstract = {
Monitoring is an issue of primary concern in current and
next generation networked systems. For example, the objective of
sensor networks is to monitor their surroundings for a variety of
different applications like atmospheric conditions, wildlife
behavior, and troop movements among others. Similarly, monitoring in
data networks is critical not only for accounting and management,
but also for detecting anomalies and attacks. Such monitoring
applications are inherently continuous and distributed, and must be
designed to minimize the communication overhead that they introduce.
In this context we introduce and study a
fundamental class of problems called ``thresholded counts'' where we
must return the {\em aggregate} frequency count of an event that is
continuously monitored by distributed nodes with a user-specified
accuracy whenever the actual count exceeds a given threshold value.

In this paper we propose to address the problem of thresholded
counts by setting local thresholds at each monitoring node and
initiating communication only when the locally observed data exceeds
these local thresholds. We explore algorithms in two categories:
{\em static thresholds} and {\em adaptive thresholds}.
In the static case, we consider
thresholds based on a linear combination of two alternate  
strategies, and show that there exists an optimal blend of the two
strategies that results in minimum communication overhead. We
further show that this optimal blend can be found using a steepest
descent search. In the adaptive case, we propose algorithms that
adjust the local thresholds based on the observed distributions of
updated information in the distributed monitoring system. We use
extensive simulations not only to verify the accuracy of our
algorithms and validate our theoretical results, but also to
evaluate the performance of the two approaches. We find that both  
approaches yield significant savings over the naive approach of
performing processing at a centralized location.}
  author = {G. Cormode and M. Garofalakis and D. Sacharidis},
  att_authors = {gc2602},
  att_private = {false},
  title = {Fast Approximate Wavelet Tracking on Streams},
  booktitle = {Extending Database Technology},
  pages = {4--22},
  year = {2006},
  url = {../papers/gc-sketch.pdf},
  note = {},
  link = {},
  abstract = {
Recent years have seen growing interest in effective
algorithms for summarizing and querying massive, high-speed
data streams.
Randomized sketch synopses 
provide accurate approximations for general-purpose summaries of
the streaming data distribution
(e.g., wavelets). 
The focus of existing work has typically been on minimizing 
{\em space requirements} of the maintained synopsis 
--- however, to effectively support high-speed
data-stream analysis, a crucial practical requirement is to also
optimize: (1) the {\em update time} for incorporating a streaming
data element in the sketch, and (2) the {\em query time} for producing
an approximate summary (e.g., the top wavelet coefficients) from the
synopsis. Such time costs must be small enough to cope with rapid
stream-arrival rates and the real-time querying requirements of typical
streaming applications (e.g., ISP network monitoring).
With cheap and plentiful memory, space is often only a 
secondary concern after query/update time costs.  

In this paper, we propose the first fast solution to
the problem of tracking wavelet representations of
one-dimensional and multi-dimensional data streams, based on a
novel stream synopsis, the {\em Group-Count Sketch (GCS)}.
By imposing a hierarchical structure of groups over the data and
applying the GCS, our algorithms can quickly recover the most
important wavelet coefficients with guaranteed accuracy.
A tradeoff between query time and update time is established, by
varying the hierarchical structure of groups, allowing the right balance
to be found for a specific data stream.
Experimental analysis confirms this tradeoff, and shows that all our
methods significantly outperform previously known methods 
in terms of both update time and query time, while
maintaining a high level of accuracy.}
  author = {G. Cormode and S. Muthukrishnan and W. Zhuang},
  title = {What's Different: Distributed, Continuous Monitoring
of Duplicate-Resilient Aggregates on Data Streams},
  att_authors = {gc2602},
  att_private = {false},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  pages = {20--31},
  year = 2006,
  url = {../papers/cdcdicde.pdf},
  link = {},
  doi = {10.1109/ICDE.2006.173},
  note = {},
  abstract = {
Emerging applications in sensor systems and network-wide
IP traffic analysis present many technical challenges. They
need distributed monitoring and continuous tracking of events.
They have severe resource constraints not only at each site in
terms of per-update processing time and archival space for high-speed
streams of observations, but also crucially, communication
constraints for collaborating on the monitoring task. These
elements have been addressed in a series of recent works.
A fundamental issue that arises is that one cannot make the
uniqueness assumption on observed events which is present in
previous works, since widescale monitoring invariably encounters
the same events at different points. For example, within the network
of an Internet Service Provider packets of the same flow will
be observed in different routers; similarly, the same individual
will be observed by multiple mobile sensors in monitoring wild
animals. Aggregates of interest on such distributed environments
must be resilient to duplicate observations.

We study such duplicate-resilient aggregates that measure the
extent of the duplication---how many unique observations are
there, how many observations are unique---as well as standard
holistic aggregates such as quantiles and heavy hitters over
the unique items. We present accuracy guaranteed, highly
communication-efficient algorithms for these aggregates that
work within the time and space constraints of high speed streams.
We also present results of a detailed experimental study on both
real-life and synthetic data.}
  author = {G. Cormode and S. Muthukrishnan and I. Rozenbaum},
  att_authors = {gc2602},
  att_private = {false},
  title = {Summarizing and Mining Inverse Distributions on Data 
                    Streams via Dynamic Inverse Sampling},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  pages = {25--36},
  year = {2005},
  url = {../papers/inversedbnvldb.pdf},
  slides = {../slides/invdbnvldb.pdf},
  note = {},
  abstract = {
Database management systems face the challenge of
dealing with massive data distributions which
arrive at high speeds while there is only small storage
available for managing and mining them.
Emerging data stream management systems approach this
problem by summarizing and mining the distributions
using samples or sketches. However, data distributions
can be ``viewed'' in different ways. For example,
a data stream of integer values can be viewed either
as the {\em forward} distribution $f(x)$, i.e., the
number of occurrences of $x$ in the
stream, or as its inverse, $f^{-1}(i)$, which is
the number of items that appear $i$ times.
While both such ``views'' are equivalent in stored
data systems, over data streams that entail approximations,
they may be significantly different. In other words,
samples and sketches developed for the forward distribution
may be ineffective for summarizing or mining the
inverse distribution. Yet, many applications such as
IP traffic monitoring naturally rely on mining inverse distributions.

We formalize the problems of managing and mining inverse
distributions and show provable differences between summarizing
the forward distribution vs the inverse distribution.
We present methods for summarizing and mining
inverse distributions of data streams: they rely
on a novel technique to maintain a dynamic sample over the stream with
provable guarantees which can be used for variety of
summarization tasks (building quantiles or equidepth histograms)
and mining (anomaly detection: finding heavy hitters,
and measuring the number of rare items), all with provable
guarantees on quality of approximations and time/space used by
our streaming methods. We also complement our analytical
and algorithmic results by presenting an experimental
study of the methods over network data streams.}
  author = {G. Cormode and M. Garofalakis},
  att_authors = {gc2602},
  att_private = {false},
  title = {Sketching Streams Through the Net:
                  Distributed Approximate Query Tracking},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  pages = {13--24},
  year = {2005},
  url = {../papers/streamsnet.pdf},
  slides = {},
  abstract = {
While traditional database systems optimize for performance 
on one-shot query processing, emerging large-scale monitoring
applications require continuous tracking of complex data-analysis
queries over collections of physically-distributed streams. 
Thus, effective solutions have to be simultaneously space/time
efficient (at each remote monitor site), communication efficient
(across the underlying communication network), and provide
continuous, guaranteed-quality approximate query answers. 
In this paper, we propose novel algorithmic solutions for the
problem of continuously tracking a broad class of complex 
aggregate queries in such a distributed-streams setting.
Our tracking schemes maintain approximate query answers with
provable error guarantees, while simultaneously optimizing the
storage space and processing time at each remote site, as well
as the communication cost across the network.
In a nutshell, our algorithms rely on tracking general-purpose
randomized sketch summaries of local streams at remote sites
along with concise prediction models of local site behavior
in order to produce highly communication- and space/time-efficient solutions.
The end result is a powerful approximate query tracking framework
that readily incorporates several complex analysis queries 
(including distributed join and multi-join aggregates, and
approximate wavelet representations), thus giving the first
known low-overhead tracking solution for such queries in the
distributed-streams model.
Experiments with real data validate our approach, revealing 
significant savings over naive solutions as well as our
analytical worst-case guarantees.}
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {Space Efficient Mining of Multigraph Streams},
  pages = {271--282},
  year = {2005},
  url = {../papers/multigraph.pdf},
  slides = {../slides/multigraph-slides.pdf},
  pubsite = {},
  note = {},
  abstract = {
  The challenge of monitoring massive amounts of data generated by
  communication networks has led to the interest in data stream systems.
  We study streams of edges in massive communication multigraphs, defined by
  (source, destination) pairs. 
  The goal is to compute properties of the underlying graph while using small
  space (much smaller than the number of communicants), 
  and to avoid bias introduced because some edges may appear many times,
  while others are seen only once. 
  We give results for three fundamental problems on multigraph degree sequences: 
  estimating frequency moments of degrees, finding the heavy hitter
  degrees, and computing range sums of degree values. 
  In all cases we are able to show space bounds
  for our summarizing algorithms
  that are significantly smaller than storing complete information.
  We use a variety of data stream methods: 
  sketches, sampling, hashing and distinct
  counting, but a common feature is that we use the approach of
  {\em cascaded summaries}: nesting multiple estimation techniques within one another.
  In our experimental study, we see that such summaries are highly
  effective, enabling massive multigraph streams to be effectively
  summarized to answer queries of interest with high accuracy using only
  a small amount of space. }
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  att_authors = {gc2602},
  att_private = {false},
  author = {G. Cormode and M. Garofalakis and S. Muthukrishnan 
                    and R. Rastogi},
  title = {Holistic Aggregates in a Networked World: 
                   Distributed Tracking of Approximate Quantiles},
  year = {2005},
  pages = {25--36},
  url = {../papers/cdquant.pdf},
  slides = {../slides/cdquant-slides.pdf},
  pubsite = {},
  note = {},
  abstract = {
  While traditional database systems optimize for performance on
  one-shot queries, emerging large-scale monitoring applications
  require continuous tracking of  complex aggregates  and
  data-distribution summaries over collections of
  physically-distributed streams.
  Thus, effective solutions have to be simultaneously space efficient
  (at each remote site), communication efficient (across the underlying
  communication network), and provide continuous, guaranteed-quality approximate answers.
  In this paper, we propose novel algorithmic solutions
  for the problem of continuously tracking complex holistic
  aggregates in such a distributed-streams setting ---
  our primary focus is on approximate quantile summaries, but our
  approach is more broadly applicable and can handle other
  holistic-aggregate functions (e.g., ``heavy-hitters'' queries).
  We present the first known distributed-tracking schemes  for
  maintaining accurate quantile estimates  with provable
  approximation guarantees, while simultaneously optimizing
the storage space at each remote site as well as the
communication cost across the network.
In a nutshell, our algorithms employ a combination
of local tracking at remote sites and simple prediction
models for local site behavior in order to produce highly
communication- and space-efficient solutions.
We perform extensive experiments with real and synthetic data
to explore the various tradeoffs and understand the role of
prediction models in our schemes.
The results clearly validate our approach, revealing significant
savings over naive solutions as well as our analytical worst-case
guarantees.}
  booktitle = {{SIAM} Conference on Data Mining ({SDM})},
  att_authors = {gc2602},
  att_private = {false},
  author = {G. Cormode and S. Muthukrishnan},
  title = {Summarizing and Mining Skewed Data Streams},
  year = {2005},
  url = {../papers/cmz-sdm.pdf},
  slides = {../slides/cmz-slides.pdf},
  note = {},
  abstract = {
Many applications generate massive data streams.
Summarizing such massive data requires fast, small space algorithms to
support post-hoc queries and mining. 
An important observation is that such streams are rarely uniform, and
real data sources typically exhibit significant skewness. 
These are well modeled by Zipf distributions, which are characterized
by a parameter, $z$, that captures the amount of skew.

We present a data stream summary that can answer point queries with 
$\epsilon$ accuracy and show that the space needed is only $O(\epsilon^{-\min\{1,1/z\}})$.
This is the first $o(1/\epsilon)$ space algorithm for this problem,
and we show it is essentially tight for skewed distributions. 
We show that the same data structure can also estimate the $L_2$ norm
of the stream in $o(1/\epsilon^2)$ space for $z>1/2$, another
improvement over the existing $\Omega(1/\epsilon^2)$ methods. 

We support our theoretical results with an experimental study over a
large variety of real and synthetic data. 
We show that significant skew is present in both textual and
telecommunication data.
Our methods give strong accuracy, significantly better than other
methods, and behave exactly in line with their analytic bounds.}
  author = {G. Cormode and F. Korn and S. Muthukrishnan and D. Srivastava},
  att_authors = {gc2602,ds8961,pk1785},
  att_private = {false},
  title = {Effective Computation of Biased Quantiles over Data Streams},
  year = {2005},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  pages = {20--31},
  url = {../papers/bquant-icde.pdf},
  slides = {../slides/bquant-slides.pdf},
  note = {},
  abstract = {
Skew is prevalent in many data sources such as IP traffic
streams.  To continually summarize the 
distribution of such data, a high-biased set of quantiles
(e.g., 50th, 90th and 99th percentiles) with finer error
guarantees at higher ranks (e.g., errors of 5, 1 and 0.1
percent, respectively) is more useful than uniformly 
distributed quantiles (e.g., 25th, 50th and 75th percentiles)
with uniform error guarantees.
In this paper, we address the following two problems.  
First, can we compute quantiles with finer 
error guarantees for the higher ranks of the data distribution 
effectively, using less space and computation time than 
computing all quantiles uniformly at the finest error?
Second, if {\em specific\/} quantiles and their error bounds 
are requested {\em a priori}, can the necessary space usage and 
computation time be reduced?

We answer both questions in the affirmative by formalizing them as the
``high-biased'' quantiles and the ``targeted'' quantiles problems,
respectively, and presenting algorithms with provable guarantees, that 
perform significantly better than previously known solutions for these problems.  
We implemented our algorithms in the Gigascope data stream management
system, and evaluated alternate approaches for maintaining the
relevant summary structures.  Our experimental results on real and
synthetic IP data streams complement our theoretical analyses, and
highlight the importance of lightweight, non-blocking implementations
when maintaining summary structures over high-speed data streams.}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {Substring Compression Problems},
  year = {2005},
  pages = {321--330},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  url = {../papers/substringsoda.pdf},
  slides = {../slides/substring-slides.pdf},
  note = {},
  abstract = {
We initiate a new class of string matching problems called 
{\em  Substring Compression Problems}.
Given a string $S$ that may be preprocessed, 
the problem is to quickly find the
compressed representation or the compressed size of any query substring of $S$
(Substring Compression Query or SCQ) or to find the length $l$ 
substring of $S$ 
whose compressed size is the least (Least Compressible Substring or LCS problem).

Starting from the seminal paper of Lempel and Ziv over 25 years ago,
many different methods have emerged for compressing entire strings.
Determining substring compressibility is a natural variant that is
combinatorially and algorithmically challenging, yet surprisingly
has not been studied before. In addition, compressibility of strings
is emerging as a tool to compare biological sequences and analyze their
information content. However, typically, the compressibility 
of the entire sequence is not as informative as that of portions of
the sequences. Thus substring compressibility may be a more suitable 
basis for sequence analysis.

We present the first known, nearly optimal algorithms for substring 
compression problems---SCQ, LCS and their generalizations---that 
are exact or provably approximate. 
Our exact algorithms exploit the structure in strings via suffix trees;
our approximate algorithms rely on new relationships we find between 
Lempel-Ziv compression and string parsings. }
  author = {G. Cormode and F. Korn and S. Muthukrishnan and D. Srivastava},
  att_authors = {gc2602,pk1785,ds8961},
  att_private = {false},
  year = {2004},
  title = {Diamond in the Rough: Finding Hierarchical Heavy Hitters in Multi-Dimensional Data},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  pages = {155--166},
  url = {../papers/ckmsh4.pdf},
  pubsite = {},
  note = {},
  abstract = {
  Data items archived in data warehouses or those that arrive online as
streams typically have attributes which take values from multiple
hierarchies (e.g., time and geographic location; source and
destination IP addresses). Providing an aggregate view of such data is
important to summarize, visualize, and analyze. We develop the
aggregate view based on certain hierarchically organized sets of
large-valued regions (``heavy hitters''). Such Hierarchical Heavy
Hitters (HHHs) were previously introduced as a crucial aggregation
technique in one dimension. In order to analyze the wider range of
data warehousing applications and realistic IP data streams, we
generalize this problem to multiple dimensions.

We illustrate and study two variants of HHHs for multi-dimensional
data. In particular, we identify ``overlap'' and ``split'' variants,
depending on how an aggregate computed for a child node in the
multi-dimensional hierarchy is propagated to its parent
element(s). For data warehousing applications, we present offline
algorithms that take multiple passes over the data and produce the
exact HHHs.  For data stream applications, we present online
algorithms that find approximate HHHs in one pass, with proven
accuracy guarantees.

We show experimentally, using real and synthetic data, that our
proposed online algorithms yield outputs which are very similar
(virtually identical, in many cases) to their offline
counterparts. The lattice property of the product of hierarchical
dimensions (``diamond'') is crucially exploited in our online
algorithms to track approximate HHHs using only a small, fixed number
of statistics per candidate node, regardless of the number of
dimensions.}
  author = {G. Cormode and F. Korn and S. Muthukrishnan and T. Johnson and O. Spatscheck and D. Srivastava},
  year = {2004},
  title = {Holistic {UDAF}s at streaming speeds},
  pages = {35--46},
  url = {../papers/cjkmss-streamudaf.pdf},
  pubsite = {},
  note = {},
  booktitle = {{ACM} {SIGMOD} International 
                    Conference on Management of Data ({SIGMOD})},
  abstract = {
Many algorithms have been proposed to approximate holistic aggregates,
such as quantiles and heavy hitters, over data streams.
However, little work has been done to explore what techniques are
required to incorporate these algorithms in a data stream query
processor, and to make them useful in practice.
In this paper, we study the performance implications of using
user-defined aggregate functions (UDAFs) to incorporate selection-based
and sketch-based algorithms for holistic aggregates into a
data stream management system's query processing architecture.
We identify key performance bottlenecks and tradeoffs, and propose
novel techniques to make these holistic UDAFs fast and space-efficient
for use in high-speed data stream applications. We evaluate
performance using generated and actual IP packet data, focusing
on approximating quantiles and heavy hitters. The best of our
current implementations can process streaming queries at OC48
speeds (2x 2.4Gbps).}
  author = {G. Cormode},
  att_authors = {gc2602},
  att_private = {false},
  title = {The Hardness of the Lemmings Game, or {Oh} no, more {NP}-Completeness Proofs},
  url = {../papers/cormodelemmings.pdf},
  link = {Cormode04LemsTR.html},
  pages = {65--76},
  booktitle = {Proceedings of Third International Conference on Fun with Algorithms},
  year = {2004},
  abstract = {
In the computer game `Lemmings', the player must guide a tribe of green-haired
lemming creatures to safety, and save them from an untimely demise. 
We formulate the decision problem which is, given a level of the game, to
decide whether it is possible to complete the level (and hence to find 
a solution to that level).  
Under certain limitations, this can be decided in polynomial
time, but in general the problem is shown to be NP-Hard.  
This can hold even if there
is only a single lemming to save, thus showing that it is hard to
approximate the number of saved lemmings to any factor.}
  author = {G. Cormode and A. Czumaj and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {How to {\em increase} the acceptance ratios of top conferences},
  url = {},
  booktitle = {Proceedings of Third International Conference on Fun with Algorithms},
  year = {2004},
  pages = {262--273},
  link = {CormodeCzumajMuthukrishnan04TR.html},
  abstract = {
 In the beginning was the pub. This work was triggered by a pub conversation where the
authors observed that many resumes list acceptance ratios of conferences where their
papers appear, boasting the low acceptance ratio. The lower the ratio, the better your paper
looks. The list might look equally impressive if one listed the rejection ratio of
conferences where one's paper was submitted and rejected. We decided to lampoon
rather than lament the effort the PC typically put in: wouldn't the world be better if we
could encourage only high quality submissions and so run top conferences with very
high acceptance ratios? This paper captures our thoughts, and it is best consumed in a
pub (and in color).}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {What's New: Finding Significant Differences in Network
                     Data Streams},
  booktitle = {Proceedings of {IEEE} {Infocom}},
  note = {},
  year = {2004},
  pages = {1534--1545},
  url = {../papers/whatsnew.pdf},
  link = {CormodeMuthukrishnan05ToN.html},
  pubsite = {},
  slides = {../slides/changes-infocom.pdf},
  abstract = {Monitoring and analyzing network traffic usage patterns 
is vital for managing IP Networks. An important problem is to provide 
network managers with information about changes in traffic, informing them 
about ``what's new''. Specifically, we focus on the challenge of finding 
significantly large differences in traffic: over time, between interfaces 
and between routers. We introduce the idea of a deltoid: an item that has 
a large difference, whether the difference is absolute, relative or variational.

We present novel algorithms for finding the most significant deltoids in 
high speed traffic data, and prove that they use small space, very small 
time per update, and are guaranteed to find significant deltoids with 
pre-specified accuracy. In experimental evaluation with real network 
traffic, our algorithms perform well and recover almost all deltoids. This 
is the first work to provide solutions capable of working over the data 
with one pass, at network traffic speeds.}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {An Improved Data Stream Summary: The Count-Min Sketch 
                     and its Applications},
  year = {2004},
  booktitle = {Proceedings of Latin American Theoretical Informatics (LATIN)},
  pages = {29--38},
  url = {../papers/cm-latin.pdf},
  note = {},
  slides = {../slides/cmlatintalk.pdf},
  link = {CormodeMuthukrishnan04CMJalg.html},
  pubsite = {},
  abstract = {We introduce a new sublinear space data structure---the Count-Min Sketch--- for summarizing data streams. Our sketch allows fundamental queries in data stream summarization such as point, range, and inner product queries to be approximately answered very quickly; in addition, it can be applied to solve several important problems in data streams such as finding quantiles, frequent items, etc. The time and space bounds we show for using the CM sketch to solve these problems significantly improve those previously known --- typically from $1/\epsilon^2$ to $1/\epsilon$ in factor.

Journal version appeared in Journal of Algorithms.}
  author = {G. Cormode},
  att_authors = {gc2602},
  att_private = {false},
  title = {Stable distributions for stream computations: 
                    it's as easy as 0,1,2},
  booktitle = {Workshop on Management and Processing of Massive Data
                     Streams at FCRC},
  year = {2003},
  url = {../papers/stable.pdf},
  slides = {../slides/mpds-stable.pdf},
  abstract = {A surprising number of data stream problems are solved by methods involving computations with stable distributions. This paper will give a short summary of some of these problems, and how the best known solutions depend on use of stable distributions; it also lists some related open problems.}
  title = {What's Hot and What's Not: Tracking Most Frequent Items Dynamically},
  att_authors = {gc2602},
  att_private = {false},
  year = {2003},
  pages = {296--306},
  author = {G. Cormode and S. Muthukrishnan},
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  note = {},
  url = {../papers/hotitems.pdf},
  slides = {../slides/whatshot-pods.pdf},
  pubsite = {},
  abstract = {Most database management systems maintain statistics on the underlying relation. One of the important statistics is that of the 'hot items' in the relation: those that appear many times (most frequently, or more than some threshold). For example, end-biased histograms keep the hot items as part of the histogram and are used in selectivity estimation. Hot items are used as simple outliers in data mining, and in anomaly detection in networking applications. We present a new algorithm for dynamically determining the hot items at any time in the relation that is undergoing deletion operations as well as inserts. Our algorithm maintains a small space data structure that monitors the transactions on the relation, and when required, quickly outputs all hot items, without rescanning the relation in the database. With user-specified probability, it is able to report all hot items. Our algorithm relies on the idea of ``group testing'', is simple to implement, and has provable quality, space and time guarantees. Previously known algorithms for this problem that make similar quality and performance guarantees can not handle deletions, and those that handle deletions can not make similar guarantees without rescanning the database. Our experiments with real and synthetic data shows that our algorithm is remarkably accurate in dynamically tracking the hot items independent of the rate of insertions and deletions.}
  title = {Finding Hierarchical Heavy Hitters in Data Streams},
  att_authors = {gc2602,ds8961,pk1785},
  att_private = {false},
  year = {2003},
  pages = {464--475},
  author = {G. Cormode and F. Korn and S. Muthukrishnan and D. Srivastava},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  url = {../papers/ckms-hhh.pdf},
  note = {},
  link = {},
  abstract = {Aggregation along hierarchies is a critical summary technique in a large variety of online applications including decision support, and network management (e.g., IP clustering, denial-of-service attack monitoring). Despite the amount of recent study that has been dedicated to online aggregation on sets (e.g., quantiles, hot items), surprisingly little attention has been paid to summarizing hierarchical structure in stream data.

The problem we study in this paper is that of finding Hierarchical Heavy Hitters (HHH): given a hierarchy and a fraction phi, we want to find all HHH nodes that have a total number of descendants in the data stream larger than phi of the total number of elements in the data stream, after discounting the descendant nodes that are HHH nodes. The resulting summary gives a topological 'cartogram' of the hierarchical data. We present deterministic and randomized algorithms for finding HHHs, which builds upon existing techniques by incorporating the hierarchy into the algorithms. Our experiments demonstrate several factors of improvement in accuracy over the straightforward approach, which is due to making algorithms hierarchy-aware}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  year = {2003},
  booktitle = {European Symposium on Algorithms},
  title = {Estimating Dominance Norms of Multiple Data Streams},
  series = {LNCS},
  note = {},
  volume = {2838},
  url = {../papers/dominance.pdf},
  slides = {../slides/esadom.pdf},
  link = {CormodeMuthukrishnan02DomTR.html},
  pubsite = {},
  abstract = {
There is much focus in the algorithms and database communities on
designing tools to manage and mine data streams.
Typically, data streams consist of multiple signals.
Formally, a stream of multiple signals is $(i,a_{i,j})$
where $i$'s correspond to the domain, $j$'s index the different
signals and $a_{i,j}\geq 0$ give the value of the $j$th signal at
point $i$.
We study the problem of finding norms that are cumulative of
the multiple signals in the data stream.

For example, consider the max-dominance norm,
defined as $\Sigma_i \max_j \{a_{i,j}\}$.
It may be thought as estimating the norm of the  ``upper
envelope'' of the multiple signals, or alternatively, as
estimating the norm of the ``marginal'' distribution of tabular data streams.
It is used in applications to estimate the ``worst case influence'' of
multiple processes, for example in IP traffic analysis, electrical
grid monitoring and
financial domain.
In addition, it is a natural measure, generalizing the union of data
streams or counting distinct elements in data streams.

We present the first known data stream algorithms for estimating
max-dominance of multiple signals.
In particular, we use workspace and time-per-item that are both sublinear (in
fact, poly-logarithmic)  in the input size.  In contrast, other
notions of dominance on streams $a$, $b$ --- min-dominance
($\Sigma_i \min_j \{a_{i,j}\}$),
count-dominance ($|\{i | a_i > b_i\}|$) or relative-dominance
($\Sigma_i a_i/\max\{1,b_i\}$ ) --- are all impossible to estimate
accurately with sublinear space.}
  author = {G. Cormode and M. Datar and P. Indyk and 
                   S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {Comparing Data Streams Using {H}amming Norms},
  year = {2002},
  pages = {335--345},
  booktitle = {International Conference on 
                  Very Large Data Bases ({VLDB})},
  note = {},
  url = {../papers/hammingnorm.pdf},
  slides = {../slides/vldb02.pdf},
  link = {CormodeDatarIndykMuthukrishnan03.html},
  pubsite = {},
  abstract = {
Massive data streams are now fundamental to many data processing
applications.  For example, Internet routers produce large scale
diagnostic data streams.  Such streams are rarely stored in
traditional databases, and instead must be processed ``on the fly'' as
they are produced.  Similarly, sensor networks produce multiple data
streams of observations from their sensors.  There is growing focus on
manipulating  data streams, and hence, there is a need to identify
basic operations of interest in managing data streams, and to support
them efficiently.
We propose computation of the Hamming norm as a basic operation of
interest.  The Hamming norm formalizes ideas
that are used throughout data processing.
When applied to a single stream, the Hamming norm gives the number of distinct 
items that are
present in that data stream, which is a statistic of great interest in
databases.  When applied to a pair of streams, the Hamming norm gives
an important measure of (dis)similarity:  the number of unequal item
counts in the two streams. Hamming norms have many uses
in comparing data streams.

We present a novel approximation technique for estimating the Hamming norm
for massive data streams; this relies on what we call the ``$l_0$
{\it sketch}''
and we prove its accuracy. We test our approximation method on a large
quantity of synthetic and real stream data, and show that the
estimation is accurate to within a few percentage points. 

  Journal version 
                 in {\em {IEEE} Transactions on Knowledge and Data
                 Engineering} 15(3):529--541, 2003}
  author = {G. Cormode and P. Indyk and N. Koudas and 
                  S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {Fast Mining of Tabular Data via 
                  Approximate Distance Computations},
  year = {2002},
  pages = {605--616},
  booktitle = {International Conference on Data Engineering ({ICDE})},
  note = {},
  url = {../papers/tabular.pdf},
  slides = {../slides/icdeslides.pdf},
  pubsite = {},
  abstract = {
Tabular data abound in many  data  stores: traditional
relational databases store tables, and
new applications also generate massive tabular data.
For example, consider the geographic distribution of cell
phone traffic at different  base stations across the country
(which can be  represented by a table indexed by
  latitude and longitude),
or the evolution of traffic at Internet
routers over time
(which can be represented by a table
indexed by time, source and destination IP addresses).
Detecting similarity patterns in such data
sets (e.g., which geographic regions have similar cell phone
usage  distribution,  which  IP subnet traffic distributions
over time intervals are similar,  etc) is of great importance.
Identification   of  such  patterns  poses  many
conceptual challenges (what is a  suitable  similarity
distance  function  for two ``regions'') as well as
technical challenges (how to perform similarity computations
efficiently as massive tables get accumulated over time) that
we address.

We  present methods  for  determining
similar regions in massive tabular data.
Our methods are for computing the ``distance'' between
any two subregions of a tabular data: they are approximate,
but  highly  accurate as we prove mathematically, and they are
fast, running in time nearly linear in the table size. Our methods
are general since these distance computations can be
applied to any  mining or
similarity   algorithms that use $L_p$ norms.
A novelty of our distance computation procedures is that they
work for any $L_p$ norms --- not only the traditional
$p=2$  or  $p=1$,  but for all $p\leq 2$; the choice of $p$,
say {\em fractional} $p$,
provides an interesting alternative similarity behavior!
We use  our algorithms in  a  detailed
experimental study of the clustering patterns in  real  tabular
data obtained  from  one  of  AT\&T's  data stores and show that our
methods  are  substantially  faster   than   straightforward
methods  while remaining highly accurate, and able to detect
interesting patterns by varying the value of $p$.}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {The String Edit Distance Matching Problem with Moves},
  year = {2002},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  pages = {667--676},
  note = {},
  url = {../papers/editmoves.pdf},
  slides = {../slides/soda2k2.pdf},
  link = {CormodeMuthukrishnan01EditTR.html},
  pubsite = {},
  abstract = {
The edit distance between two strings $S$ and $R$
is defined to be the minimum number of character inserts,
deletes and changes needed to convert $R$ to $S$.
Given a text string $t$ of length $n$, and a pattern string $p$
of length $m$, informally, the string edit distance matching problem is to
compute the smallest edit distance between $p$ and
substrings of $t$. A well known dynamic programming algorithm
takes time $O(nm)$ to solve this problem, and it is an important
open problem in Combinatorial Pattern Matching to
significantly improve this bound.

We relax the problem
so that (a) we allow an additional operation, namely,
{\em substring moves}, and (b) we approximate the string edit distance
up to a factor of $O(\log n \log^* n)$. ($\log^* n$ is the
number of times the $\log$ function is applied to $n$ to produce a constant.)
Our result is a near linear time deterministic
algorithm for this version of the problem.
This is the first
known significantly subquadratic algorithm for a string
edit distance problem in which the distance involves
nontrivial alignments.
Our results are obtained by embedding strings into $L_1$
vector space using a simplified parsing technique we call
{\em Edit Sensitive Parsing} (ESP). This embedding is
approximately distance preserving, and we show many applications
of this embedding to string proximity problems including nearest neighbors,
outliers, and streaming computations with strings.
  author = {G. Cormode and S. Muthukrishnan 
                   and S. C. {{S}}ahinalp},
  att_authors = {gc2602},
  att_private = {false},
  title = {Permutation Editing and Matching via Embeddings},
  booktitle = {International Colloquium on Automata, 
                  Languages and Programming ({ICALP})},
  note = {},
  volume = {2076},
  year = {2001},
  pages = {481--492},
  url = {../papers/cms-perm.pdf},
  slides = {../slides/icalp.pdf},
  pubsite = {},
  abstract = {
If the genetic maps of two species are modelled as permutations of
 (homologous) genes, the number of chromosomal rearrangements
 in the form of deletions, block moves, inversions etc. to transform one
 such permutation to another can be used as a measure of their evolutionary
 distance. Motivated by such scenarios in Computational Biology and
elsewhere, we study problems of
computing distances between permutations
 as well as matching permutations in
 sequences, and finding most similar permutation from a collection
 (``nearest neighbor'').

We adopt a
 general approach: embed permutation distances of relevance into well-known
 vector spaces in an approximately distance-preserving manner,
 and solve the resulting problems on the well-known spaces.
Our results are as follows:
We present the first known approximately distance preserving
embeddings of these permutation distances into well-known spaces.
Using these embeddings, we obtain several results, including
the first known efficient
 solution for approximately solving nearest neighbor
 problems with permutations;
the first known
 algorithms for finding permutation distances in the ``data stream'' model.
We consider a novel
 class of problems called {\em permutation matching} problems
which are similar to string matching problems,
except that the
pattern is a permutation (rather than a string)
and present linear or near-linear time
algorithms for approximately solving permutation matching problems;
in contrast, the corresponding string problems take significantly
  author = {G. Cormode and M. Paterson and S. C.
                   {{S}}ahinalp and U. Vishkin},
  att_authors = {gc2602},
  att_private = {false},
  title = {Communication Complexity of Document Exchange},
  booktitle = {{ACM}-{SIAM} Symposium on Discrete Algorithms ({SODA})},
  note = {},
  year = {2000},
  pages = {197--206},
  url = {../papers/docexchange.pdf},
  link = {},
  slides = {../slides/soda.pdf},
  abstract = {
We have two users, $A$ and $B$, who hold documents $x$ and $y$ respectively.
Neither of the users has any information about the other's document.
They exchange messages
 so that $B$ computes $x$; it may be required that $A$ compute
 $y$ as well.
Our goal is to design communication protocols with
 the main objective of minimizing the total number of bits they exchange;
 other objectives are
 minimizing the number of rounds and the complexity of internal
 computations.
An important notion which determines the efficiency of the
 protocols is how one measures the distance between $x$ and $y$.
We consider several metrics for measuring this distance,
 namely the Hamming metric, the Levenshtein metric (edit distance),
 and a new LZ metric, which is introduced in this paper.
We show how to estimate the distance between $x$ and $y$ using a
 single message of logarithmic size.
For each metric, we present the first
 communication-efficient protocols, which often
 match the corresponding lower bounds.
A consequence of these are error-correcting codes for these error
 models which correct up to $d$ errors in $n$ characters using
 $O(d \log n)$ bits.
Our most interesting methods use a new
 {\em histogram transformation} that we introduce to convert
 edit distance to $L_1$ distance.}
  author = {G. Ozsoyoglu and N. H. Balkir and
                  G. Cormode and  Z. M. Ozsoyoglu},
  att_authors = {gc2602},
  att_private = {false},
  title = {Electronic Books in Digital Libraries},
  year = {2000},
  booktitle = {Proceedings of {IEEE} Advances in Digital Libraries ({ADL})},
  note = {},
  pages = {5--14},
  url = {../papers/electronicbooks.pdf},
  link = {OzsoyogluCormodeBalkirOzsoyoglu04.html},
  pubsite = {},
  abstract = {
Electronic Book is an application with a multimedia database of instructional resources, which include hyperlinked text, instructor's audio/video clips, slides, animation, still images, etc. as well as content-based information about these data, and metadata such as annotations, tags, and cross-referencing information. Electronic books on the Internet or on CDs today are not easy to learn from. We propose the use of a multimedia database of instructional resources in constructing and delivering multimedia lessons about topics in an electronic book.

We introduce an electronic book data model containing (a) topic objects and (b) instructional resources, called instruction modules which are multimedia presentations possibly capturing real-life lectures of instructors. We use the notion of topic prerequisites for topics at different detail levels, to allow electronic book users to request / compose multimedia lessons about topics on the electronic book. We present automated construction of the ``best'' user-tailored lesson (as a multimedia presentation). }

This file was generated by bibtex2html 1.92.