@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: C:\alldata\data\webbib\bib2bib-1.92.exe -oc article -ob journal.bib -c '$type = "ARTICLE"' cormode.bib}}
  author = {Karthik Prasad and
               Sayan Ghosh and
               Graham Cormode and
               Ilya Mironov and
               Ashkan Yousefpour and
               Pierre Stock},
  title = {Reconciling Security and Communication Efficiency in Federated Learning},
  year = {2023},
  journal = ieeedeb,
  volume = 46,
  number = 1,
  pubsite = {},
  url = {../papers/efficient_fl.pdf},
  abstract = {Cross-device Federated Learning is an increasingly popular machine learning setting to train a model by leveraging 
a large population of client devices with high privacy and security guarantees. 
communication efficiency remains a major bottleneck when scaling federated learning to production environments, particularly due to bandwidth constraints during uplink communication.
In this paper, we formalize and address the problem of compressing client-to-server model updates
under the Secure Aggregation primitive, a core component of Federated Learning pipelines that allows the server to aggregate the client updates without accessing them individually. 
In particular, we adapt standard scalar quantization and pruning methods
to Secure Aggregation and propose Secure Indexing, a variant of Secure Aggregation that supports quantization for extreme compression.
We establish state-of-the-art results on LEAF benchmarks in a secure Federated Learning setup with up to 40$\times$ compression in uplink communication 
with no meaningful loss in utility compared to uncompressed baselines.}
  author = {Graham Cormode and Zohar Karnin and Edo Liberty and Justin Thaler and Vesel{\'{y}}},
  title = {Relative Error Streaming Quantiles},
  journal = {Journal of the ACM ({JACM})},
  volume = 70,
  number = 5,
  pages = {1--48},
  year = 2023,
  url = {../papers/relative_error_quantiles-JACM-final.pdf}
  author = {Akash Bharadwaj and Graham Cormode},
  title = {Federated computation: a survey of concepts and challenges},
  doi = {},
  journal = {Distributed and Parallel Databases},
  year = 2023,
  pubsite = {},
  url = {../papers/federatedcomputation.pdf},
  abstract = {
        Federated Computation is an emerging area that seeks to provide stronger privacy for user data, by performing large scale, distributed computations where the data remains in the hands of users. Only the necessary summary information is shared, and additional security and privacy tools can be employed to provide strong guarantees of secrecy. The most prominent application of federated computation is in training machine learning models (federated learning), but many additional applications are emerging, more broadly relevant to data management and querying data. This survey gives an overview of federated computation models and algorithms. It includes an introduction to security and privacy techniques and guarantees, and shows how they can be applied to solve a variety of distributed computations providing statistics and insights to distributed data. It also discusses the issues that arise when implementing systems to support federated computation, and open problems for future research.
  author = {Graham Cormode and Zohar Karnin and Edo Liberty and Justin Thaler and Vesel{\'{y}}},
  title = {Relative Error Streaming Quantiles},
  year = 2022,
  journal = {SIGMOD Record},
  volume = 51,
  number = 1,
  month = mar,
  pages = {66--79},
  link = {},
  abstract = {Estimating ranks, quantiles, and distributions over streaming data is a central task in data analysis and monitoring.
Given a stream of $n$ items from a data universe equipped
with a total order, the task is to compute a sketch (data
structure) of size polylogarithmic in $n$. Given the sketch and
a query item $y$, one should be able to approximate its rank
in the stream, i.e., the number of stream elements smaller
than or equal to $y$.
Most works to date focused on additive $\epsilon n$ error approximation, culminating in the KLL sketch that achieved optimal asymptotic behavior. This paper investigates multiplicative $(1\pm \epsilon)$-error approximations to the rank. Practical motivation for multiplicative error stems from demands to understand the tails of distributions, and hence for sketches to be more accurate near extreme values.
The most space-efficient algorithms due to prior work store either $O(\log(\epsilon^2n)/\epsilon^2)$ or $O(\log^3(\epsilon n)/\epsilon)$ universe items. We present a randomized sketch storing $O(\log^{1.5}(\epsilon n)/\epsilon)$ items,
which is within an $O(\sqrt{\log(\epsilon n)})$ factor of optimal. Our algorithm does not require prior knowledge of the stream length and is fully mergeable, rendering it suitable for parallel and distributed computing environments.}
  author = {Peter Kairouz and
               H. Brendan McMahan and
               Brendan Avent and
               Aur{\'{e}}lien Bellet and
               Mehdi Bennis and
               Arjun Nitin Bhagoji and
               Kallista A. Bonawitz and
               Zachary Charles and
               Graham Cormode and
               Rachel Cummings and
               Rafael G. L. D'Oliveira and
               Hubert Eichner and
               Salim El Rouayheb and
               David Evans and
               Josh Gardner and
               Zachary Garrett and
               Adri{\`{a}} Gasc{\'{o}}n and
               Badih Ghazi and
               Phillip B. Gibbons and
               Marco Gruteser and
               Za{\"{\i}}d Harchaoui and
               Chaoyang He and
               Lie He and
               Zhouyuan Huo and
               Ben Hutchinson and
               Justin Hsu and
               Martin Jaggi and
               Tara Javidi and
               Gauri Joshi and
               Mikhail Khodak and
               Jakub Kone{\v{c}}n{\'y} and
               Aleksandra Korolova and
               Farinaz Koushanfar and
               Sanmi Koyejo and
               Tancr{\`{e}}de Lepoint and
               Yang Liu and
               Prateek Mittal and
               Mehryar Mohri and
               Richard Nock and
               Ayfer {\"{O}}zg{\"{u}}r and
               Rasmus Pagh and
               Hang Qi and
               Daniel Ramage and
               Ramesh Raskar and
               Mariana Raykova and
               Dawn Song and
               Weikang Song and
               Sebastian U. Stich and
               Ziteng Sun and
               Ananda Theertha Suresh and
               Florian Tram{\`{e}}r and
               Praneeth Vepakomma and
               Jianyu Wang and
               Li Xiong and
               Zheng Xu and
               Qiang Yang and
               Felix X. Yu and
               Han Yu and
               Sen Zhao},
  title = {Advances and Open Problems in Federated Learning},
  journal = {Foundations and Trends in Machine Learning},
  volume = {14},
  number = {1-2},
  pages = {1--210},
  year = {2021},
  url = {},
  doi = {10.1561/2200000083},
  pubsite = {},
  link = {}
  author = {Graham Cormode},
  title = {Current Trends in Data Summaries},
  year = {2022},
  issue_date = {December 2021},
  publisher = {Association for Computing Machinery},
  volume = {50},
  number = {4},
  issn = {0163-5808},
  doi = {10.1145/3516431.3516433},
  abstract = {The research area of data summarization seeks to find small data structures that can be updated flexibly, and answer certain queries on the input accurately. Summaries are widely used across the area of data management, and are studied from both theoretical and practical perspectives. They are the subject of ongoing research to improve their performance and broaden their applicability. In this column, recent developments in data summarization are surveyed, with the intent of inspiring further advances.},
  journal = {SIGMOD Record},
  month = jan,
  pages = {615},
  numpages = {10},
  url = {../papers/summaries2021.pdf}
  title = {Aggregation and Transformation of Vector-Valued Messages in the Shuffle Model of Differential Privacy},
  author = {Graham Cormode and Carsten Maple and Mary Scott},
  journal = {{IEEE} Trans. Inf. Forensics Secur.},
  volume = {17},
  pages = {612--627},
  year = {2022},
  url = {../papers/tifs22.pdf},
  link = {},
  doi = {10.1109/TIFS.2022.3147643},
  abstract = {Advances in communications, storage and computational
technology allow significant quantities of data to be
collected and processed by distributed devices. Combining the
information from these endpoints can realize significant societal
benefit but presents challenges in protecting the privacy of individuals,
especially important in an increasingly regulated world.
Differential privacy (DP) is a technique that provides a rigorous
and provable privacy guarantee for aggregation and release. The
Shuffle Model for DP has been introduced to overcome challenges
regarding the accuracy of local-DP algorithms and the privacy
risks of central-DP. In this work we introduce a new protocol
for vector aggregation in the context of the Shuffle Model. The
aim of this paper is twofold; first, we provide a single message
protocol for the summation of real vectors in the Shuffle Model,
using advanced composition results. Secondly, we provide an
improvement on the bound on the error achieved through using
this protocol through the implementation of a Discrete Fourier
Transform, thereby minimizing the initial error at the expense of
the loss in accuracy through the transformation itself. This work
will further the exploration of more sophisticated structures such
as matrices and higher-dimensional tensors in this context, both
of which are reliant on the functionality of the vector case.}
  author = {Kook Jin Ahn and
               Graham Cormode and
               Sudipto Guha and
               Andrew McGregor and
               Anthony Wirth},
  title = {Correlation Clustering in Data Streams},
  pubsite = {},
  url = {../papers/corrclusteringalgorithmica.pdf},
  journal = {Algorithmica},
  volume = 83,
  number = 7,
  pages = {1980-2017},
  doi = {10.1007/s00453-021-00816-9},
  year = {2021},
  abstract = {
Clustering is a fundamental tool for analyzing large data sets. A
rich body of work has been devoted to designing data-stream algorithms
for the relevant optimization problems such as $k$-center, $k$-median,
and $k$-means. Such algorithms need to be both time  and and space efficient. 
In this paper, we address the problem of \emph{correlation clustering} in the dynamic data stream
model. The stream consists of updates to the edge weights of a graph
on~$n$ nodes and the goal is to find a node-partition such that the
end-points of negative-weight edges are typically in different
clusters whereas the end-points of positive-weight edges are typically
in the same cluster. We present polynomial-time, $O(n\cdot {polylog}
n)$-space approximation algorithms for natural problems that arise.

We first develop data structures based on linear sketches that allow
the ``quality'' of a given node-partition to be measured. We then 
combine these data structures with convex programming and
sampling techniques to solve the relevant approximation problem.
Unfortunately, the standard LP and SDP formulations are not
solvable in $O(n\cdot {polylog} n)$-space. Our work presents
space-efficient algorithms for the convex programming required, as well as
approaches to reduce the adaptivity of the sampling.}
  author = {Graham Cormode and Pavel Vesel{\'{y}}},
  journal = {Theory of Computing Systems},
  title = {Streaming Algorithms for Bin Packing and Vector Scheduling},
  volume = 65,
  number = 6,
  pages = {916-942},
  doi = {10.1007/s00224-020-10011-y},
  year = 2021,
  url = {../papers/bin_packing-tocs-rev1.pdf},
  abstract = {
  Problems involving the efficient arrangement of simple objects, as captured by bin
  packing and makespan scheduling,
  are fundamental tasks in combinatorial optimization.
  These are well understood in the traditional online and offline cases,
  but have been less well-studied when the volume of the input is truly
  massive, and cannot even be read into memory.
  This is captured by the streaming model of computation, where the aim
  is to approximate the cost of the solution in one pass over the data,
  using small space. As a result, streaming algorithms produce
  concise input summaries that approximately preserve the optimum value.
  We design the first efficient streaming algorithms
  for these fundamental problems in combinatorial optimization.
  For \textsc{Bin Packing}, we provide a streaming asymptotic $(1+\epsilon)$-approximation
  with ${O}\sim(1/\epsilon)$ memory,
  where ${O}\sim$ hides logarithmic factors.
  Moreover, such a space bound is essentially optimal. 
  Our algorithm implies a streaming $(d+\epsilon)$-approximation for \textsc{Vector Bin Packing}
  in $d$ dimensions, running in space ${O}\sim(d/\epsilon)$.
  For the related \textsc{Vector Scheduling} problem, we show how to construct
  an input summary in space ${O}\sim(d^2\cdot m / \epsilon^2)$ that preserves
  the optimum value up to a factor of $2 - \frac{1}{m} +\epsilon$,
  where $m$ is the number of identical machines.}
  title = {Constrained Private Mechanisms for Count Data},
  author = {Graham Cormode and Tejas Kulkarni and Divesh Srivastava},
  year = 2021,
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = 33,
  number = 2,
  month = feb,
  pages = {415--430},
  link = {},
  url = {../papers/constrained-tkde.pdf},
  abstract = {Concern about how to aggregate sensitive user data without compromising
individual privacy is a major barrier to greater availability of
Differential privacy has emerged as an accepted model to
release sensitive information while giving a statistical guarantee for
Many different algorithms are possible to address different target
We focus on the core problem of count queries, and seek to design
{\em mechanisms} to release data associated with a group of $n$
Prior work has focused on designing mechanisms by raw optimization of a
loss function, without regard to the consequences on the results. 
This can leads to mechanisms with undesirable properties, such as
never reporting some outputs (gaps), and overreporting others
We tame these pathological behaviors by introducing a set of desirable properties that
mechanisms can obey.
Any combination of these can be satisfied by solving a linear program (LP)
which minimizes a cost function, with constraints enforcing the properties.
We focus on a particular cost function, and 
provide explicit constructions that are optimal for certain
combinations of properties, and show a closed form for their cost.
In the end, there are only a handful of distinct optimal mechanisms to choose
between: one is the well-known (truncated) geometric mechanism; the
second a novel mechanism that we introduce here, and the remainder are
found as the solution to particular LPs.
These all avoid the bad behaviors we identify. 
We demonstrate in a set of experiments on real and
synthetic data which is preferable in practice, for different
combinations of data distributions, constraints, and privacy parameters.}
  title = {Verifiable Stream Computation and {Arthur}--{Merlin} Communication},
  author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor and Justin Thaler and Suresh Venktatasubramanian},
  year = 2019,
  journal = {{SIAM} Journal on Computing ({SICOMP})},
  pubsite = {},
  url = {../papers/vscamc.pdf},
  abstract = {In the setting of streaming interactive proofs (SIPs), a client (verifier)
needs to compute a given function on a massive stream of data, arriving
online, but is unable to store even a small fraction of the data. It
outsources the processing to a third party service (prover), but is unwilling
to blindly trust answers returned by this service. Thus, the service cannot
simply supply the desired answer; it must {\em convince} the verifier of its
correctness via a short interaction after the stream has been seen.

In this work we study ``barely interactive'' SIPs. Specifically, we show that
one or two rounds of interaction suffice to solve several query
problems---including Index, Median, Nearest Neighbor Search, Pattern Matching,
and Range Counting---with polylogarithmic space and communication costs.  Such
efficiency with $O(1)$ rounds of interaction was thought to be impossible
based on previous work.

On the other hand, we initiate a formal study of the limitations of
constant-round SIPs by introducing a new hierarchy of communication models
called Online Interactive Proofs (OIPs). The online nature of these models is
analogous to the streaming restriction placed upon the verifier in a SIP. We
give upper and lower bounds that (1) characterize, up to quadratic blowups,
every finite level of the OIP hierarchy in terms of other well-known
communication complexity classes, (2) separate the first four levels of the
hierarchy, and (3) reveal that the hierarchy collapses to the fourth level.
Our study of OIPs reveals marked contrasts and some parallels with the classic
Turing Machine theory of interactive proofs, establishes limits on the power
of existing techniques for developing constant-round SIPs, and provides a new
characterization of (non-online) Arthur--Merlin communication in terms of an
online model.
  title = {$L_p$ Samplers and Their Applications: A Survey},
  author = {Graham Cormode and Hossein Jowhari},
  journal = {ACM Computing Surveys},
  url = {../papers/Lpsurvey.pdf},
  year = 2018,
  abstract = {The notion of $L_p$ sampling, and corresponding algorithms known as $L_p$ samplers, have found a wide range of applications in the design of
data stream algorithms and beyond.
In this survey we present some of the core algorithms to achieve
  this sampling distribution based on ideas from hashing, sampling and
We give results for the special cases of insertion-only
inputs, lower bounds for the sampling problems, and ways to
efficiently sample multiple elements.
We describe a range of applications of 
$L_p$ drawing on problems across the domain of computer science,
from matrix and graph computations, as well as to geometric and vector
streaming problems.}
  author = {Graham Cormode and Anirban Dasgupta and Amit Goyal and Chi Hoon Lee},
  title = {An Evaluation of Multi-Probe Locality Sensitive Hashing for Computing Similarities over Web-Scale Query Logs},
  journal = {PLOS ONE},
  doi = {10.1371/journal.pone.0191175},
  volume = 13,
  number = 1,
  pages = {e0191175},
  year = 2018,
  url = {../papers/multiprobelsh.pdf},
  abstract = {
    Many modern applications of AI such as web search, mobile
    browsing, image processing, and natural language processing 
    rely on finding similar items from a large database of complex
    Due to the very large scale of data involved 
    (e.g., users' queries from commercial search engines),
    computing such near or nearest neighbors 
    is a non-trivial task, as the computational cost grows 
    significantly with the number of items.
    To address this challenge, we adopt 
    Locality Sensitive Hashing (a.k.a, LSH) methods and evaluate 
    four variants in a distributed computing environment (specifically, Hadoop). 
    We identify several optimizations which improve performance, suitable
    for deployment in very large scale settings. 
    The experimental results demonstrate our 
    variants of LSH achieve the robust performance with better
    recall compared with ``vanilla'' LSH, even
    when using the same amount of space.}
  author = {Jun Zhang and Graham Cormode and Magda Procopiuc and Divesh Srivastava and Xiaokui Xiao},
  title = {PrivBayes: Private Data Release via Bayesian Networks},
  journal = {{ACM} Transactions on Database Systems},
  year = 2017,
  url = {../papers/privbayes-tods.pdf},
  abstract = {
  Privacy-preserving data publishing is an important problem that has
been the focus of extensive study.
The state-of-the-art solution for this problem is differential privacy,
which offers a strong degree of privacy protection without making
restrictive assumptions about the adversary.
Existing techniques using differential privacy, however, cannot
effectively handle the publication of high-dimensional data.
In particular, when the input dataset contains a large number of
attributes, existing methods require injecting a prohibitive amount of
noise compared to the signal in the data, which renders the
published data next to useless.

To address the deficiency of the existing methods, this paper presents
PrivBayes, a differentially private method for releasing
high-dimensional data.
Given a dataset $D$, PrivBayes first constructs a Bayesian network
$\mathcal{N}$, which (i) provides a succinct model of the correlations
among the attributes in $D$ and (ii) allows us to approximate the
distribution of data in $D$ using a set $\mathcal{P}$ of
low-dimensional marginals of $D$.
After that, PrivBayes injects noise
into each marginal in $\mathcal{P}$ to ensure differential privacy,
and then uses the noisy marginals and the Bayesian network to
construct an approximation of the data distribution in $D$. Finally,
PrivBayes samples tuples from the approximate distribution to
construct a synthetic dataset, and then releases the synthetic
data. Intuitively, PrivBayes circumvents the curse of dimensionality,
as it injects noise into the low-dimensional marginals in
$\mathcal{P}$ instead of the high-dimensional dataset $D$.
Private construction of Bayesian networks turns out to be
significantly challenging, and we introduce a novel approach that uses
a surrogate function for mutual information to build the model more
accurately. We experimentally evaluate PrivBayes on real data, and demonstrate that it significantly outperforms existing solutions in terms of accuracy.
  author = {Graham Cormode},
  title = {Data Sketching},
  journal = {Communications of the {ACM} ({CACM})},
  volume = 60,
  number = 9,
  pages = {48--55},
  year = 2017,
  url = {../papers/cacm-sketch.pdf},
  pubsite = {},
  abstract = {Do you ever feel overwhelmed by a constant stream of information?  It
can seem like there is a barrage of new email and text messages
arriving, phone calls, articles to read, and knocks on the door.
Putting these pieces together to keep track of what's important can be a real challenge. 

The same information overload is of concern in many computational settings. 
For example, telecommunications companies want to keep track of the activity on their network, to identify the overall network health, and spot anomalies or changes in behavior.  Yet, the scale of events occurring is huge: many millions of network events per hour, per network element. 
And while new technologies allow the scale and granularity of events being monitored to increase by orders of magnitude, the capacity of computing elements to make sense of these (processors, memory and disks) is barely increasing.  
Even on a small scale, the amount of information may be too large to
store in an impoverished setting (say, an embedded device),
or be too big to conveniently keep in fast storage. 

In response to this challenge, the model of streaming data processing has grown in popularity. 
In this setting, the aim is no longer to capture, store and index
every minute event, but rather to quickly process each observation to
create some summary of the current state.
Following its processing, an event is dropped, and is no longer accessible. 
The summary that is retained is often referred to as sketch of the
Coping with the vast scale of information means making a number of compromises: the description of the world is approximate, rather than exact; 
the nature of queries to be answered must be decided in advance, rather than after the fact; and some questions are now insoluble. 
However, the ability to process vast quantities of data at blinding speeds with modest resources can more than make up for these limitations. 
As a consequence, streaming methods have been adopted in a number of domains, starting with telecommunications, but spreading to search engines, social networks, finance, and time-series analysis.  
These ideas are also finding application in areas where traditional approaches are applicable, but the rough and ready sketching approach is more cost-effective.
Successful applications of sketching involve a mixture of algorithmic tricks, systems know-how, and mathematical insight, and have led to new research contributions in each of these areas. 

In this article, we introduce the ideas behind, and applications, of sketching, with a focus on the algorithmic innovations. 
That is, we describe some algorithmic developments in the abstract,
and then indicate the subsequent steps needed to put them into
practice, with examples. 
We will see four novel algorithmic ideas, and discuss some emerging areas.}
  author = {Edith Cohen and Graham Cormode and Nick Duffield and Carsten 
  title = {On the Tradeoff between Stability and Fit},
  journal = {{ACM} Transactions on Algorithms},
  year = 2016,
  volume = 13,
  number = 1,
  pubsite = {},
  url = {../papers/stability.pdf},
  abstract = {
In computing, as in many aspects of life, changes incur cost. Many 
optimization problems are formulated as a one-time instance starting 
from scratch. However, a common case that arises is when we already have 
a set of prior assignments and must decide how to respond to a new set 
of constraints, given that each change from the current assignment comes 
at a price. That is, we would like to maximize the fitness or efficiency 
of our system, but we need to balance it with the changeout cost from 
the previous state.

We provide a precise formulation for this tradeoff and analyze the 
resulting stable extensions of some fundamental problems in measurement 
and analytics. Our main technical contribution is a stable extension of 
Probability Proportional to Size (PPS) weighted random sampling, with 
applications to monitoring and anomaly detection problems. We also 
provide a general framework that applies to top-k, minimum spanning 
tree, and assignment. In both cases, we are able to provide exact 
solutions and discuss efficient incremental algorithms that can find new 
solutions as the input changes.}
  author = {Chakrabarti, Amit and Cormode, Graham and McGregor, Andrew},
  title = {Robust Lower Bounds for Communication and Stream Computation},
  year = {2016},
  pages = {1--35},
  doi = {10.4086/toc.2016.v012a010},
  publisher = {Theory of Computing},
  journal = {Theory of Computing},
  volume = {12},
  number = {10},
  url = {../papers/rcctoc.pdf},
  link = {},
  abstract = {
   We study the communication complexity of evaluating functions when the
input data is randomly allocated (according to some known distribution)
amongst two or more players, possibly with information overlap.  This
naturally extends previously studied variable partition models such as
the best-case and worst-case partition models.  We aim to understand whether the
hardness of a communication problem holds for almost every allocation of
the input, as opposed to holding for perhaps just a few atypical

A key application is to the heavily studied data stream model. There is
a strong connection between our communication lower bounds and lower
bounds in the data stream model that are ``robust'' to the ordering of
the data.  That is, we prove lower bounds for when the order of the
items in the stream is chosen not adversarially but rather uniformly (or
near-uniformly) from the set of all permutations. This random-order data
stream model has attracted recent interest, since lower bounds here give
stronger evidence for the inherent hardness of streaming problems. 

Our results include the first random-partition communication lower
bounds for problems including multi-party set disjointness and
gap-Hamming-distance. Both are tight. We also extend and improve
previous results for a form of pointer
jumping that is relevant to the problem of selection (in particular,
median finding). Collectively, these results yield lower bounds for a
variety of problems in the random-order data stream model, including
estimating the number of distinct elements, approximating frequency
moments, and quantile estimation.}
  author = {Ge Luo and
               Lu Wang and
               Ke Yi and
               Graham Cormode},
  title = {Quantiles over data streams: experimental comparisons, new analyses,
               and further improvements},
  journal = {The {VLDB} Journal},
  volume = {25},
  number = {4},
  pages = {449--472},
  year = {2016},
  link = {},
  url = {../papers/nquantvldbj.pdf},
  abstract = { A fundamental problem in data management and analysis is to generate descriptions of
  the distribution of data.  It is most common to give such descriptions in terms of
  the cumulative distribution, which is characterized by the quantiles of the data.
  The design and engineering of efficient methods to find these quantiles has attracted
  much study, especially in the case where the data is given incrementally, and we must
  compute the quantiles in an online, streaming fashion.  While such algorithms have
  proved to be extremely useful in practice, there has been limited formal comparison
  of the competing methods, and no comprehensive study of their performance.  In this
  paper, we remedy this deficit by providing a taxonomy of different methods, and
  describe efficient implementations.  In doing so, we propose new variants that have
  not been studied before, yet which outperform existing methods.
  To illustrate this,
  we provide detailed experimental comparisons demonstrating the tradeoffs between
  space, time, and accuracy for quantile computation.}
  author = {Katsiaryna Mirylenka and
               Graham Cormode and
               Themis Palpanas and
               Divesh Srivastava},
  title = {Conditional heavy hitters: detecting interesting correlations in data
  journal = {The {VLDB} Journal},
  volume = {24},
  number = {3},
  pages = {395--414},
  year = {2015},
  pubsite = {},
  url = {../papers/condhh-vldbj.pdf},
  abstract = {
  The notion of {\em heavy hitters}---items that make up a large
  fraction of the population---has been successfully used in a variety
  of applications across sensor and RFID monitoring, network data
  analysis, event mining, and more. 
  Yet this notion often fails to capture the semantics we desire when we observe data in the form of
  correlated pairs. 
  Here, we are interested in items that are {\em conditionally} frequent: when a particular item is frequent within the context of its parent item. 
  In this work, we introduce and formalize the notion of Conditional Heavy Hitters to identify such
  items, with applications in network monitoring, and Markov chain modeling.
  We explore the relationship between Conditional Heavy Hitters and other related notions in the literature, and show analytically and experimentally the usefulness of our approach.
  We introduce several algorithm variations that allow us to efficiently find conditional heavy hitters for input data with very different characteristics, and provide analytical results for their performance.
  Finally, we perform experimental evaluations with several synthetic and real datasets to demonstrate the efficacy of our methods, and to study the behavior of the proposed algorithms for different types of data. 
  author = {Stavros Papadopoulos and
               Graham Cormode and
               Antonios Deligiannakis and
               Minos N. Garofalakis},
  title = {Lightweight Query Authentication on Streams},
  journal = {{ACM} Transactions on Database Systems},
  volume = {39},
  number = {4},
  pages = {30:1--30:45},
  year = 2015,
  pubsite = {},
  url = {../papers/streamauthtods.pdf},
  abstract = {
We consider a \emph{stream outsourcing} setting, where a data owner
delegates the management of a set of disjoint data streams to an
untrusted server. The owner \emph{authenticates} his streams via
signatures. The server processes continuous queries on the union of
the streams for clients trusted by the owner. Along with the results,
the server sends proofs of result correctness derived from the owner's
signatures, which are easily verifiable by the clients. We design
novel constructions for a collection of fundamental problems over streams
represented as \emph{linear algebraic} queries. In particular, our basic schemes
authenticate \emph{dynamic vector sums} and
\emph{dot products}, 
as well as \emph{dynamic matrix products}. 
These techniques can be adapted for authenticating a wide range
of important operations in streaming environments, including group by
queries, joins, in-network aggregation, similarity matching, and event processing.
All our schemes are very \emph{lightweight}, and offer strong \emph{cryptographic}
guarantees derived from formal definitions and proofs. 
We experimentally confirm the practicality of our schemes in the
performance sensitive streaming setting. 
  author = {Graham Cormode and Hossein Jowhari},
  title = {A Second Look at Counting Triangles in Graph Streams (Revised)},
  year = 2017,
  journal = {Theoretical Computer Science},
  pages = {22-30},
  volume = {683},
  pubsite = {},
  url = {../papers/tri9-3.pdf},
  abstract = {
In this paper we present improved results on the problem of counting triangles in edge streamed graphs. 
For graphs with $m$ edges and at least $T$ triangles, we show that an extra look over
the stream yields a two-pass streaming algorithm that uses $O(\frac{m}{\epsilon^{4.5}\sqrt{T}})$ space and
outputs a $(1+\epsilon)$ approximation of the number of triangles in the graph. This
 improves upon the two-pass streaming tester of Braverman, Ostrovsky and Vilenchik, ICALP 2013, which
 distinguishes between triangle-free graphs and graphs with at least $T$ triangle using $O(\frac{m}{T^{1/3}})$ space. Also, in terms of dependence on $T$, we show that
 more passes would not lead to a better space bound. In other words, we prove there is no constant pass streaming algorithm that distinguishes between triangle-free graphs from graphs with at least $T$ triangles using $O(\frac{m}{T^{1/2+\rho}})$ space for any constant $\rho \ge 0$.   
  author = {Pankaj K. Agarwal and
               Graham Cormode and
               Zengfeng Huang and
               Jeff M. Phillips and
               Zhewei Wei and
               Ke Yi},
  title = {Mergeable summaries},
  journal = {{ACM} Transactions on Database Systems},
  volume = {38},
  number = {4},
  year = {2013},
  pages = {26},
  link = {},
  url = {../papers/mergeabletods.pdf},
  abstract = {
      We study the {\em mergeability} of data summaries.  Informally speaking,
  mergeability requires that, given two summaries on two data sets, there
  is a way to merge the two summaries into a single summary on the two data
  sets combined together, while preserving the error and size guarantees.
  This property means that the summaries can be merged in a way akin to other
  algebraic operators such as sum and max,
which is especially useful for computing summaries on massive distributed
data.  Several data summaries are trivially mergeable by construction, most
notably all the {\em sketches} that are linear functions of the data sets.
But some other fundamental
ones like those for heavy hitters and quantiles, are not (known to be)
mergeable.  In this paper, we demonstrate that these summaries are indeed
mergeable or can be made mergeable after appropriate modifications.
Specifically, we show that for $\epsilon$-approximate heavy hitters, there is a
deterministic mergeable summary of size $O(1/\epsilon)$; for $\epsilon$-approximate
quantiles, there is a deterministic summary of size $O((1/\epsilon)
\log(\epsilon n))$ that has a restricted form of mergeability, and a randomized
one of size $O((1/\epsilon) \log^{3/2}(1/\epsilon))$ with full
mergeability. We also extend our results to geometric summaries such as
$\epsilon$-approximations which permit approximate multidimensional range counting queries.  While most of the results in
this paper are theoretical in nature, some of the algorithms are actually
very simple and even perform better than the previously best known 
which we demonstrate through experiments in a simulated sensor network.

We also achieve two results of independent interest: 
(1) we provide the best known randomized streaming bound for $\epsilon$-approximate quantiles that depends only on $\epsilon$, of size $O((1/\epsilon) \log^{3/2}(1/\epsilon))$, and 
(2) we demonstrate that the MG and the SpaceSaving summaries for heavy hitters are isomorphic. 
  author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor and Justin Thaler},
  journal = {{ACM} Transactions on Algorithms},
  year = 2014,
  title = {Annotations in Data Streams},
  volume = 11,
  number = 1,
  url = {../papers/annotationj.pdf},
  abstract = {
  The central goal of data stream algorithms is to process massive streams
  of data using {\em sublinear} storage space. 
  Motivated by work in the database community on outsourcing database and data stream processing, we ask whether the space usage of such algorithms can be further
  reduced by enlisting a more powerful ``helper'' who can {\em annotate}
  the stream as it is read. We do not wish to blindly trust the helper, so we
  require that the algorithm be convinced of having computed a correct
  answer.   We show
  upper bounds that achieve a non-trivial tradeoff between the
  amount of annotation used and the space required to verify it.  We also
  prove lower bounds on such tradeoffs, often nearly matching the upper
  bounds, via notions related to Merlin-Arthur communication complexity.
  Our results cover the classic data stream problems of selection,
  frequency moments, and fundamental graph problems such as
  triangle-freeness and connectivity. 
    Our work is also part of a
  growing trend --- including recent studies of multi-pass streaming,
  read/write streams and randomly ordered streams --- of asking more
  complexity-theoretic questions about data stream processing. 
  It is a recognition that, in addition to practical relevance,  the
  data stream model
raises many interesting theoretical questions in its own right. }
  author = {Amit Chakrabarti and
               Graham Cormode and
               Ranganath Kondapally and
               Andrew McGregor},
  title = {Information Cost Tradeoffs for Augmented Index and Streaming
               Language Recognition},
  journal = {{SIAM} Journal on Computing ({SICOMP})},
  volume = {42},
  number = {1},
  year = {2013},
  pages = {61-83},
  url = {../papers/pqsicomp.pdf},
  link = {},
  abstract = {This paper makes three main contributions to the theory of communication
complexity and stream computation. First, we present new bounds on the
information complexity of \textsc{augmented-index}. In contrast to analogous results for \textsc{index}
by Jain, Radhakrishnan and Sen [\emph{J.~ACM}, 2009], we have to overcome the
significant technical challenge that protocols for \textsc{augmented-index} may violate the
``rectangle property'' due to the inherent input sharing. Second, we use these
bounds to resolve an open problem of Magniez, Mathieu and Nayak [\emph{STOC},
2010] that asked about the multi-pass complexity of recognizing Dyck
languages. This results in a natural separation between the standard
multi-pass model and the multi-pass model that permits reverse passes.  Third,
we present the first \emph{passive memory checkers} that verify the
interaction transcripts of priority queues, stacks, and double-ended queues.
We obtain tight upper and lower bounds for these problems, thereby addressing
an important sub-class of  the memory checking framework of Blum et al.
[\emph{Algorithmica}, 1994]. }
  author = {Graham Cormode and Donatella Firmani},
  journal = {Distributed and Parallel Databases},
  note = {Special issue on Data Summarization on Big Data},
  link = {},
  url = {../papers/l0journal.pdf},
  year = 2014,
  volume = 32,
  number = 3,
  pages = {315-335},
  title = {A unifying framework for $l_0$-sampling algorithms},
  abstract = {
    The problem of building an $l_0$-sampler is to sample
    near-uniformly from the support set of a dynamic multiset.  
    This problem has a variety of applications within data analysis,
    computational geometry and graph algorithms. 
    In this paper, we abstract a set of steps for building an $l_0$-sampler, based on sampling, recovery and selection. 
    We analyze the implementation of an $l_0$-sampler within this
    framework, and show how prior constructions of 
    $l_0$-samplers can all be expressed in terms of these steps. 
    Our experimental contribution is to provide a first detailed study of the
accuracy and computational cost of $l_0$-samplers.}
  title = {Socializing the h-index },
  journal = {Journal of Informetrics },
  volume = {7},
  number = {3},
  pages = {718 - 721},
  year = {2013},
  link = {},
  url = {../papers/soch.pdf},
  author = {Graham Cormode and Qiang Ma and S. Muthukrishnan and Brian Thompson},
  abstract = {A variety of bibliometric measures have been proposed in order to
quantify the impact of researchers and their work. The h-index is a
notable and widely-used example which aims to improve over
simple metrics such as raw counts of papers or citations. However, a
limitation of this measure is that it considers each author in
isolation, and does not account for contributions as part of a
collaborative team. To address this, we propose a natural variant that
we dub the Social h-index. The aim of this measure is to redistribute
the h-index score to reflect an individual's impact on the research
community. In addition to describing this new measure, we provide
examples, discuss its properties, and contrast with other measures. }
  author = {Graham Cormode},
  journal = {SIGMOD Record},
  title = {What does an Associate Editor actually do?},
  year = 2013,
  month = jun,
  volume = 42,
  number = 2,
  pages = {52--58},
  url = {../papers/editor.pdf},
  link = {},
  abstract = {What does a Associate Editor (AE) of a journal actually
do? The answer may be far from obvious. This article
describes the steps that one AE follows in handling a
submission. The aim is to shed light on the process, for
the benefit of authors, reviewers, and other AEs.}
  author = {Graham Cormode and S. Muthukrishnan and Jinyun Yan},
  title = {Studying the source code of scientific research},
  journal = {SIGKDD Explorations},
  month = dec,
  year = 2012,
  volume = 14,
  number = 2,
  pages = {59-62},
  link = {},
  url = {../papers/scienceographykdd.pdf},
  abstract = {
   Just as inspecting the source code of programs tells us a lot
   about the process of programming, inspecting the ``source
   code'' of scientific papers informs on the process of scientific writing. We report on our study of the source of tens
   of thousands of papers from Computer Science and Mathematics.}
  author = {Graham Cormode},
  journal = {SIGMOD Record},
  year = 2013,
  month = mar,
  volume = 42,
  number = 1,
  title = {The Continuous Distributed Monitoring Model},
  link = {TalkCormode11CD.html},
  url = {../papers/cdsurveysigmodrecord.pdf},
  link = {},
  abstract = {
    In the model of continuous distributed monitoring, a number of
    observers each see a stream of observations.  Their goal is to work
    together to compute a function of the union of their observations.
    This can be as simple as counting the total number of observations, or
    more complex non-linear functions such as tracking the entropy of the
    induced distribution.  Assuming that it is too costly to simply
    centralize all the observations, it becomes quite challenging to
    design solutions which provide a good approximation to the current
    answer, while bounding the communication cost of the observers, and
    their other resources such as their space usage.  This survey 
    introduces this model, and describe a selection results in this
    setting, from the simple counting problem to a variety of other
    functions that have been studied.}
  author = {Graham Cormode and S. Muthukrishnan and Ke Yi and Qin Zhang},
  title = {Continuous sampling from distributed streams},
  journal = {Journal of the ACM ({JACM})},
  volume = 59,
  number = 2,
  month = apr,
  year = 2012,
  att_authors = {gc2602},
  att_private = {false},
  url = {../papers/cdsamplejacm.pdf},
  abstract = {
     A fundamental problem in data management is to draw and maintain a sample
     of a large data set, for approximate query answering, selectivity
     estimation, and query planning.  With large, streaming data sets, this
     problem becomes particularly difficult when the data is shared across
     multiple distributed sites.  The main challenge is to ensure that a
     sample is drawn uniformly across the union of the data while minimizing
     the communication needed to run the protocol on the evolving data.  At
     the same time, it is also necessary to make the protocol lightweight, by
     keeping the space and time costs low for each participant.
     In this paper, we present communication-efficient protocols for
     continuously maintaining a sample (both with and without replacement)
     from $k$ distributed streams.  These apply to the case when we want a
     sample from the full streams, and to the sliding window cases of only the
     $W$ most recent elements, or arrivals within the last $w$ time units.  We
     show that our protocols are optimal (up to logarithmic factors), not just
     in terms of the communication used, but also the time and space costs for
     each participant.

  author = {Graham Cormode and Michael Mitzenmacher and Justin Thaler},
  year = 2013,
  volume = 65,
  number = 2,
  pages = {409--442},
  att_authors = {gc2602},
  att_private = {false},
  link = {},
  url = {../papers/graphipj.pdf},
  journal = {Algorithmica},
  title = {Streaming Graph Computations with a Helpful Advisor},
  abstract = {Motivated by the trend to outsource work to commercial cloud computing
services, we consider a variation of the streaming paradigm where a
streaming algorithm can be assisted by a powerful helper that can
provide annotations to the data stream.  We extend previous work on
such {\em annotation models} by considering a number of graph
streaming problems.  Without annotations, streaming algorithms for 
graph problems generally require significant memory;  we show that
for many standard problems, including all graph problems that can 
be expressed with totally unimodular integer programming formulations,
only constant space (measured in words) is
needed for single-pass algorithms given linear-sized annotations.
We also obtain protocols achieving essentially \textit{optimal} tradeoffs between
annotation length and memory usage for several important problems, including
integer matrix-vector multiplication, as well as
shortest $s$-$t$ path in small-diameter graphs. We also obtain non-trivial tradeoffs for minimum 
weight bipartite perfect matching and shortest $s$-$t$ path in general graphs. }
  author = {Graham Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  journal = {IEEE Software},
  url = {../papers/cmsoft.pdf},
  title = {Approximating Data with the Count-Min Data Structure},
  year = 2012,
  abstract = {Sketch data structures have been introduced to compactly summarize massive data sets and allow key properties of the data to be approximated accurately. In this paper, we introduce a prominent example, the Count-Min sketch, and describe how it accurately solves a fundamental problem of tracking counts of items in large data sets. Applications, implementations and extensions are also discussed.}
  author = {Graham Cormode and S. Muthukrishnan and Ke Yi},
  title = {Algorithms for distributed functional monitoring},
  journal = {{ACM} Transactions on Algorithms},
  att_authors = {gc2602},
  att_private = {false},
  volume = 7,
  number = 2,
  year = 2011,
  pages = {1--21},
  url = {../papers/cdxj.pdf},
  link = {},
  abstract = {
We study what we call {\em functional monitoring} problems.  We have $k$
players each receiving a stream of items, and communicating with a central
coordinator.  Let the multiset of items received by player $i$ up until
time $t$ be $A_i(t)$.  The coordinator's task is to monitor a given
function $f$ computed over the union of the inputs $\cup_{i} A_i(t)$, {\em
  continuously} at all times $t$.  The goal is to minimize the number of
bits communicated between the players and the coordinator.  Of interest is
the approximate version where the coordinator outputs $1$ if $f \geq \tau$
and $0$ if $f\leq (1-\epsilon)\tau$.  This defines the
$(k,f,\tau,\epsilon)$ distributed functional monitoring problem.
Functional monitoring problems are fundamental in distributed systems, in
particular sensor networks, where we must minimize communication; they also
connect to the well studied streaming model and communication complexity.
Yet few formal bounds are known for functional monitoring.

We give upper and lower bounds for the $(k,f,\tau,\epsilon)$ problem for
some of the basic $f$'s.  In particular, we study the frequency moments
$F_p$ for $p=0,1,2$.  For $F_0$ and $F_1$, we obtain monitoring algorithms
with costs almost the same as their one-shot computation algorithms.
However, for $F_2$ the monitoring problem seems much harder.  We give a
carefully constructed multi-round algorithm that uses ``sketch summaries''
at multiple levels of details and solves the $(k,F_2,\tau,\epsilon)$
problem with communication ${O}\sim(k^2/\epsilon + k^{3/2}/\epsilon^3)$.
Our algorithmic techniques are likely to be useful for other functional
monitoring problems as well.
  author = {Graham Cormode and Balachander Krishnamurthy and Walter Willinger},
  title = {A Manifesto for Modeling and Measurement in Social Media},
  att_authors = {gc2602,bk1836,ww9241},
  att_private = {false},
  journal = {First Monday},
  month = sep,
  volume = 15,
  number = 9,
  year = 2010,
  url = {../papers/mmmsm.pdf},
  link = {},
  abstract = {
  Online Social Networks (OSNs) have been the subject of a great deal of study in recent years. The
  majority of this study has used simple models, such as node-and-edge graphs, to describe the data. In
  this paper, we argue that such models, which necessarily limit the structures that can be described and
  omit temporal information, are insufficient to describe and study OSNs. Instead, we propose that a richer
  class of Entity Interaction Network models should be adopted. We outline a checklist of features that
  can help build such a model, and apply it to three popular networks (Twitter, Facebook and YouTube) to
  highlight important features. We also discuss important considerations for the collection, validation and
sharing of OSN data.}
  author = {Graham Cormode and Minos Garofalakis},
  title = {Histograms and Wavelets on Probabilistic Data},
  att_authors = {gc2602},
  att_private = {false},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = 22,
  number = 8,
  year = 2010,
  month = aug,
  pages = {1142-1157},
  url = {../papers/phistj.pdf},
  link = {},
  abstract = {There is a growing realization that uncertain information is a first-class citizen in modern database management. As such, we need techniques to correctly and efficiently process uncertain data in database systems. In particular, data reduction techniques that can produce concise, accurate synopses of large probabilistic relations are crucial. Similar to their deterministic relation counterparts, such compact probabilistic data synopses can form the foundation for human understanding and interactive data exploration, probabilistic query planning and optimization, and fast approximate query processing in probabilistic database systems. In this paper, we introduce definitions and algorithms for building histogram- and Haar wavelet-based synopses on probabilistic data. The core problem is to choose a set of histogram bucket boundaries or wavelet coefficients to optimize the accuracy of the approximate representation of a collection of probabilistic tuples under a given error metric. For a variety of different error metrics, we devise efficient algorithms that construct optimal or near optimal size B histogram and wavelet synopses. This requires careful analysis of the structure of the probability distributions, and novel extensions of known dynamic-programming-based techniques for the deterministic domain. Our experiments show that this approach clearly outperforms simple ideas, such as building summaries for samples drawn from the data distribution, while taking equal or less time.}
  author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor},
  att_authors = {gc2602},
  att_private = {false},
  title = {A Near-Optimal Algorithm for Computing the Entropy of a Stream},
  journal = {{ACM} Transactions on Algorithms},
  year = 2010,
  volume = 6,
  number = 3,
  url = {../papers/entropyj.pdf},
  abstract = {  We describe a simple algorithm for approximating the empirical entropy
  of a stream of $m$ values up to a multiplicative factor of $(1+\epsilon)$ using a single pass, $O(\epsilon^{-2} \log
  (\delta^{-1}) \log m)$ words of space, and $O(\log \epsilon^{-1} + \log
  \log \delta^{-1} + \log \log m)$ processing time per item in the
  stream.  Our algorithm is based upon a novel extension of a method
  introduced by Alon, Matias, and Szegedy.  This improves
  over previous work on this problem.
  We show a space lower bound of $\Omega({\epsilon^{-2}/ \log^2
  (\epsilon^{-1})})$, demonstrating that our algorithm is near-optimal in terms of
  its dependency on $\epsilon$.  

  We show that generalizing to multiplicative-approximation of the $k$-th order entropy requires close to
  linear space for $k\geq 1$. In contrast we show that additive-approximation is possible in a single pass using only poly-logarithmic space.  Lastly, we show how to compute a multiplicative 
  approximation to the entropy of a random walk on an undirected graph.}
  author = {Radu Berinde and Graham Cormode and Piotr Indyk and Martin Strauss},
  att_authors = {gc2602},
  att_private = {false},
  title = {Space-optimal Heavy Hitters with Strong Error Bounds},
  journal = {{ACM} Transactions on Database Systems},
  year = 2010,
  volume = 35,
  number = 4,
  url = {../papers/countersj.pdf},
  abstract = {The problem of finding heavy hitters and approximating the frequencies
of items is at the heart of many problems in data stream analysis. It
has been observed that several proposed solutions to this problem can
outperform their worst-case guarantees on real data. This leads to the
question of whether some stronger bounds can be guaranteed. We answer
this in the positive by showing that a class of ``counter-based
algorithms'' (including the popular and very space-efficient Frequent
and SpaceSaving algorithms) provide much stronger approximation
guarantees than previously known. Specifically, we show that errors in
the approximation of individual elements do not depend on the
frequencies of the most frequent elements, but only on the frequency
of the remaining ``tail.''
This shows that counter-based methods are the
most space-efficient (in fact, space-optimal) algorithms having this
strong error bound. 

This tail guarantee allows these algorithms to solve the ``sparse
recovery'' problem. Here, the goal is to recover a faithful
representation of the vector of frequencies, $f$. We prove that using
space $O(k)$, the algorithms construct an approximation $f^*$ to the
frequency vector $f$ so that the $L_1$ error $||f-f^*||_1$ is close to the best
possible error $\min_{f'} ||f' - f||_1$, where $f'$ ranges over all vectors
with at most $k$ non-zero entries. This improves the previously best
known space bound of about $O(k \log n)$ for streams without element deletions
(where $n$ is the size of the domain from which stream elements are drawn). Other
consequences of the tail guarantees are results for skewed (Zipfian) data, and
guarantees for accuracy of merging multiple summarized streams.  }
  author = {Graham Cormode and Jeffrey Jestes and Feifei Li and Ke Yi},
  att_authors = {gc2602},
  att_private = {false},
  title = {Semantics of Ranking Queries for Probabilistic Data },
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  year = 2011,
  volume = 23,
  number = 12,
  pages = {1903--1917},
  url = {../papers/exprankj.pdf},
  abstract = {Recently, there have been several attempts to
propose definitions and algorithms for ranking queries on
probabilistic data.  However, these lack many intuitive
properties of a top-$k$ over deterministic data.  We define numerous
fundamental properties, including {\em exact-$k$}, {\em
  containment}, {\em unique-rank}, {\em value-invariance}, and {\em
  stability}, which are satisfied by ranking queries on certain
data.  We argue these properties should also be carefully studied
in defining ranking queries in probabilistic data, and fulfilled by
definition for ranking uncertain data for most applications.
We propose an intuitive new ranking definition based on the 
observation that the ranks of a tuple across all
possible worlds represent a well-founded rank distribution. We
studied the ranking definitions based on the expectation, the median
and other statistics of this rank distribution for a tuple and
derived the {\em expected rank, median rank and quantile rank}
correspondingly.  We are able to prove that the expected rank, median
rank and quantile rank satisfy all these properties for a ranking
query.  We provide efficient solutions to compute such rankings across
the major models of uncertain data, such as attribute-level and
tuple-level uncertainty. 
Finally, a comprehensive experimental study
confirms the effectiveness of our approach.}
  author = {Graham Cormode and Divesh Srivastava and Ting Yu and Qing Zhang},
  att_authors = {gc2602,ds8961},
  att_private = {false},
  title = {Anonymizing bipartite graph data using safe groupings },
  journal = {The {VLDB} Journal},
  volume = 19,
  number = 1,
  pages = {115--139},
  year = 2010,
  pubsite = {},
  url = {../papers/gprivj.pdf},
  abstract = {Private data often comes in the form of associations between entities,
such as customers and products bought from a pharmacy, which are
naturally represented in the form of a large, sparse bipartite graph.
As with  tabular data, it is desirable to be able to publish
anonymized versions of such data, to allow others to perform ad hoc
analysis of aggregate graph properties.
However, existing tabular anonymization techniques do not give useful
or meaningful results when applied to graphs: small changes or masking
of the edge structure can radically change aggregate graph properties.

We introduce a new family of anonymizations for bipartite graph
data, called $(k,l)$-groupings. These groupings preserve the
underlying graph structure perfectly, and instead anonymize the
mapping from entities to nodes of the graph. We identify a class
of ``safe'' $(k,l)$-groupings that have provable guarantees to
resist a variety of attacks, and show how to find such safe
groupings. We perform experiments on real bipartite graph data to
study the utility of the anonymized version, and the impact of
publishing alternate groupings of the same graph data. Our
experiments demonstrate that $(k,l)$-groupings offer strong
tradeoffs between privacy and utility.}
  title = {Methods for finding frequent items in data streams },
  author = {Graham Cormode and Marios Hadjieleftheriou},
  att_authors = {gc2602,mh6516},
  att_private = {false},
  journal = {The {VLDB} Journal},
  year = 2010,
  volume = 19,
  number = 1,
  pages = {3--20},
  pubsite = {},
  url = {../papers/freqvldbj.pdf},
  abstract = {
  The frequent items problem is to process a stream of items and find all items occurring more than a given fraction of the time. It is one of the most heavily studied problems in data stream mining, dating back to the 1980s. Many applications rely directly or indirectly on finding the frequent items, and implementations are in use in large scale industrial systems. However, there has not been much comparison of the different methods under uniform experimental conditions. It is common to find papers touching on this topic in which important related work is mischaracterized, overlooked, or reinvented. In this paper, we aim to present the most important algorithms for this problem in a common framework. We have created baseline implementations of the algorithms and used these to perform a thorough experimental study of their properties. We give empirical evidence that there is considerable variation in the performance of frequent items algorithms. The best methods can be implemented to find frequent items with high accuracy using only tens of kilobytes of memory, at rates of millions of items per second on cheap modern hardware.}
  title = {Time-decaying Sketches for Robust Aggregation of Sensor Data},
  author = {Graham Cormode and Srikanta Tirthapura and Bojian Xu},
  att_authors = {gc2602},
  att_private = {false},
  journal = {{SIAM} Journal on Computing ({SICOMP})},
  volume = 39,
  number = 4,
  pages = {1309-1339},
  year = {2009},
  url = {../papers/dupdecsicomp.pdf},
  pubsite = {},
  abstract = {
  We present a new sketch for summarizing network data. The sketch has
  the following properties which make it useful in
  communication-efficient aggregation in distributed streaming
  scenarios, such as sensor networks: the sketch is
  duplicate-insensitive, i.e. re-insertions of the same data will not
  affect the sketch, and hence the estimates of aggregates. Unlike
  previous duplicate-insensitive sketches for sensor data
  aggregation, it is also time-decaying, so that
  the weight of a data item in the sketch can decrease with time
  according to a user-specified decay function.  The sketch can give
  provably approximate guarantees for various aggregates of data,
  including the sum, median, quantiles, and frequent elements. The size
  of the sketch and the time taken to update it are both polylogarithmic
  in the size of the relevant data.  Further, multiple sketches computed
  over distributed data can be combined without loss of accuracy. To our
  knowledge, this is the first sketch that combines all the above
  title = {Time-decayed correlated aggregates over data streams},
  author = {Graham Cormode and Srikanta Tirthapura and Bojian Xu},
  att_authors = {gc2602},
  att_private = {false},
  journal = {Statistical Analysis and Data Mining},
  volume = 2,
  number = {5-6},
  pages = {294-310},
  year = 2009,
  url = {../papers/coragsam.pdf},
  pubsite = {},
  abstract = {Data stream analysis frequently relies on identifying correlations and posing conditional queries on the data after it has been seen. Correlated aggregates form an important example of such queries, which ask for an aggregation over one dimension of stream elements which satisfy a predicate on another dimension. Since recent events are typically more important than older ones, time decay should also be applied to downweight less significant values. We present space-efficient algorithms as well as space lower bounds for the time-decayed correlated sum, a problem at the heart of many related aggregations. By considering different fundamental classes of decay functions, we separate cases where efficient approximations with relative error or additive error guarantees are possible, from other cases where linear space is necessary to approximate. In particular, we show that no efficient algorithms with relative error guarantees are possible for the popular sliding window and exponential decay models, resolving an open problem. This negative result for the exponential decay holds even if the stream is allowed to be processed in multiple passes. The results are surprising, since efficient approximations are known for other data stream problems under these decay models. This is a step toward better understanding which sophisticated queries can be answered on massive streams using limited memory and computation.}
  author = {Ke Yi and
               Feifei Li and
               Graham Cormode and
               Marios Hadjieleftheriou and
               George Kollios and
               Divesh Srivastava},
  title = {Small synopses for group-by query verification on outsourced
               data streams},
  att_authors = {gc2602,ds8961,mh6516},
  att_private = {false},
  journal = {{ACM} Transactions on Database Systems},
  volume = {34},
  number = {3},
  year = {2009},
  url = {../papers/pirs.pdf},
  pubsite = {},
  abstract = {
Due to the overwhelming flow of information in many data stream
applications, data outsourcing is a natural and effective paradigm
for individual businesses to address the issue of scale.  In the standard
data outsourcing model, the data owner outsources streaming data to one or
more third-party servers, which answer queries posed by a potentially
large number of clients on the data owner's behalf.  Data outsourcing
intrinsically raises issues of trust, making outsourced query assurance on
data streams a problem with important practical implications.  Existing
solutions proposed in this model all build upon cryptographic primitives
such as signatures and collision-resistant hash functions, which only work for
certain types of queries, e.g., simple selection/aggregation queries.

In this paper, we consider another common type of queries, namely, ``{\tt
  GROUP BY, SUM}'' queries, which previous techniques fail to support.
Our new solutions are not based on cryptographic primitives, but instead
use algebraic and probabilistic techniques to compute a small synopsis on
the true query result, which is then communicated to the client so as to
verify the correctness of the query result returned by the server.  The
synopsis uses a constant amount of space irrespective of the result
size, has an extremely small probability of failure, and can be maintained
using no extra space when the query result changes as elements stream by.
We then generalize our synopsis to allow some tolerance on the number of
erroneous groups, in order to support semantic load shedding on the server.
When the number of erroneous groups is indeed tolerable, the synopsis can
be strengthened so that we can locate and even correct these errors.
Finally, we implement our techniques and perform an empirical evaluation
using live network traffic.}
  author = {Graham Cormode and
               Marios Hadjieleftheriou},
  title = {Finding the frequent items in streams of data},
  journal = {Communications of the {ACM} ({CACM})},
  volume = {52},
  number = {10},
  year = {2009},
  pages = {97-105},
  url = {../papers/freqcacm.pdf},
  pubsite = {},
  abstract = {
The frequent items problem is to process a stream of items and find all
those which  occur more than a given fraction of the time.
It is one of the most heavily studied problems in mining data streams,
dating back to the 1980s.
Many other applications rely directly or indirectly on finding the frequent
items, and implementations are in use in large scale industrial systems.
In this paper, we describe the most important algorithms for
this problem in a common framework.
We place the different solutions in their historical context, and
describe the connections between them, with the aim of clarifying some
of the confusion that has surrounded their properties. 

To further illustrate the different properties of the algorithms, 
we provide baseline implementations.
This allows us to 
 give empirical evidence that there is considerable variation in the
performance of frequent item algorithms.
The best methods can be implemented to find frequent items with high
accuracy using only tens of kilobytes of memory, at rates of millions of
items per second on cheap modern hardware.}
  title = {How NOT to review a paper: The tools and techniques of the adversarial reviewer},
  author = {Graham Cormode},
  journal = {SIGMOD Record},
  att_authors = {gc2602},
  att_private = {false},
  month = dec,
  year = 2008,
  volume = 37,
  number = 4,
  pages = {100-104},
  url = {../papers/adversarial.pdf},
  pubsite = {},
  abstract = {
  There are several useful guides available for how to review a paper in
  Computer Science. 
  These are soberly presented, carefully reasoned and sensibly argued. 
  As a result, they are not much fun. 
  So, as a contrast, this note is a checklist of how {\em not} to review
  a paper.  
  It details techniques that are 
  unethical, unfair, or just plain nasty. 
  Since in Computer Science we often present arguments about how an
  adversary would approach a particular problem, this note 
  describes the adversary's strategy.}
  title = {Key Differences between Web 1.0 and Web 2.0},
  author = {Graham Cormode and Balachander Krishnamurthy},
  att_authors = {gc2602,bk1836},
  att_private = {false},
  journal = {First Monday},
  month = jun,
  volume = 13,
  number = 6,
  year = 2008,
  link = {},
  url = {../papers/web2.pdf},
  abstract = {
Web 2.0 is a buzzword introduced in 200304 which is commonly used to encompass various novel 
phenomena on the World Wide Web. Although largely a marketing term, some of the key attributes 
associated with Web 2.0 include the growth of social networks, bidirectional communication, 
various glue technologies, and significant diversity in content types. We are not aware of a 
technical comparison between Web 1.0 and 2.0. While most of Web 2.0 runs on the same substrate as 
1.0, there are some key differences. We capture those differences and their implications for 
technical work in this paper. Our goal is to identify the primary differences leading to the 
properties of interest in 2.0 to be characterized. We identify novel challenges due to the 
different structures of Web 2.0 sites, richer methods of user interaction, new technologies, and 
fundamentally different philosophy. Although a significant amount of past work can be reapplied, 
some critical thinking is needed for the networking community to analyze the challenges of this 
new and rapidly evolving environment.}
  author = {Graham Cormode and Minos Garofalakis},
  att_authors = {gc2602},
  att_private = {false},
  title = {Approximate continuous querying over distributed streams},
  journal = {{ACM} Transactions on Database Systems},
  year = 2008,
  month = jun,
  link = {},
  volume = 33,
  number = 2,
  abstract = {
While traditional database systems optimize for performance 
on one-shot query processing, emerging large-scale monitoring
applications require continuous tracking of complex data-analysis
queries over collections of physically-distributed streams. 
Thus, effective solutions have to be simultaneously space/time
efficient (at each remote monitor site), communication efficient
(across the underlying communication network), and provide
continuous, guaranteed-quality approximate query answers. 
In this paper, we propose novel algorithmic solutions for the
problem of continuously tracking a broad class of complex 
aggregate queries in such a distributed-streams setting.
Our tracking schemes maintain approximate query answers with
provable error guarantees, while simultaneously optimizing the
storage space and processing time at each remote site, 
and the communication cost across the network.
In a nutshell, our algorithms rely on tracking general-purpose
randomized sketch summaries of local streams at remote sites
along with concise prediction models of local site behavior
in order to produce highly communication- and space/time-efficient 
The end result is a powerful approximate query tracking framework
that readily incorporates several complex analysis queries 
(including distributed join and multi-join aggregates, and
approximate wavelet representations), thus giving the first
known low-overhead tracking solution for such queries in the
distributed-streams model.
Experiments with real data validate our approach, revealing 
significant savings over naive solutions as well as our
analytical worst-case guarantees.}
  author = {Graham Cormode and Flip Korn and S. Muthukrishnan and 
Divesh Srivastava},
  att_authors = {gc2602,ds8961,pk1785},
  att_private = {false},
  title = {Finding Hierarchical Heavy Hitters in Streaming Data},
  journal = {{ACM} Transactions on Knowledge Discovery from Data ({TKDD})},
  month = jan,
  year = 2008,
  volume = 1,
  number = 4,
  url = {../papers/h4.pdf},
  link = {},
  abstract = {
Data items that arrive online as streams
typically have attributes which take values from one or more
hierarchies (time and geographic location; source and
destination IP addresses; etc.). Providing an aggregate view of such
data is important for summarization, visualization, and analysis.
We develop an aggregate view based on certain
organized sets of large-valued regions (``heavy hitters'')
corresponding to hierarchically discounted frequency counts.
We formally define the notion of {\em Hierarchical Heavy Hitters} (HHHs).
We first consider computing (approximate) HHHs over a data stream
drawn from a single hierarchical attribute. 
We formalize the problem and give deterministic algorithms
to find them in a single pass over the input.

In order to analyze a wider range of realistic data streams
(e.g., from IP traffic monitoring applications),
we generalize this problem to multiple dimensions.
Here, the semantics of HHHs are more complex, since a ``child'' node
can have multiple ``parent'' nodes.
We present online algorithms that find approximate HHHs in one pass,
with provable accuracy guarantees.
The product of hierarchical dimensions form a 
mathematical lattice structure. 
Our algorithms exploit this structure, and so are able to 
to track approximate
HHHs using only a small, fixed number of statistics per stored item,
regardless of the number of dimensions.

We show experimentally, using real data, that our
proposed algorithms yield outputs which are very similar
(virtually identical, in many cases) to offline computations
of the exact solutions whereas straightforward heavy hitters
based approaches give significantly inferior answer quality.
Furthermore, the proposed algorithms result in an order of
magnitude savings in data structure size while performing
  journal = {Transactions on Networking},
  title = {What's New: Finding Significant Differences 
in Network Data Streams},
  att_authors = {gc2602},
  att_private = {false},
  author = {G. Cormode and S. Muthukrishnan},
  month = {December},
  year = {2005},
  volume = {13},
  number = {6},
  pages = {1219-1232},
  url = {../papers/changes-ton.pdf},
  link = {CormodeMuthukrishnan04Infocom.html},
  pubsite = {},
  note = {},
  abstract = {
  Monitoring and analyzing network traffic usage patterns is vital for
  managing IP Networks. 
  An important problem is to provide network managers with information
  about changes in traffic, informing them about ``what's new''.
  Specifically, we focus on the challenge of finding significantly large
  differences in traffic: over time, between interfaces and between
  We introduce the idea of a {\em deltoid}: an item that has a large
  whether the difference is {\em absolute}, {\em relative} or
  {\em variational}. 
  We present novel algorithms for finding the most significant deltoids in high
  speed traffic data, and prove that they use small space, very small time per
  update, and are guaranteed to find significant deltoids with
  pre-specified accuracy.
  In experimental evaluation with real network traffic, our algorithms perform well and recover 
  almost all deltoids.
  This is the first work to provide solutions 
capable of working over the data with one pass, at network traffic speeds. }
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {What's Hot and What's Not: Tracking Most Frequent Items Dynamically},
  journal = {{ACM} Transactions on Database Systems},
  note = {},
  year = {2005},
  month = {March},
  volume = {30},
  number = {1},
  pages = {249--278},
  url = {../papers/whatshot-tods.pdf},
  link = {CormodeMuthukrishnan03PODS.html},
  pubsite = {},
  abstract = {
Most database management systems maintain statistics
on the underlying relation. One of the important statistics
is that of the 'hot items' in the relation: those
that appear many times (most frequently, or more than some threshold). 
For example, end-biased histograms keep the hot items as part 
of the histogram
and are used in selectivity estimation. 
Hot items are used as simple outliers in data mining, and 
in anomaly detection in many applications.

We present new methods for dynamically
determining the hot items at any time in a relation
which is undergoing deletion operations as well as inserts.
Our methods maintain small space data structures that 
monitor the transactions on the relation, and when required, quickly output
all hot items, without rescanning the relation in the database. 
With user-specified probability, all hot items are correctly reported.
Our methods rely on ideas from 'group testing'.
They are simple to implement, and have provable 
quality, space and time guarantees. 

Previously known algorithms for this problem that make similar quality and
performance guarantees can not handle deletions, and those that handle
deletions can not make similar guarantees without rescanning the database.  Our
experiments with real and synthetic data shows that our
algorithms are accurate in dynamically tracking the hot items
independent of the rate of insertions and deletions. }
  author = {G. Cormode},
  att_authors = {gc2602},
  att_private = {false},
  title = {Representations of the Research Student in Popular Culture},
  year = {2004},
  journal = {Annals of Improbable Research},
  volume = {10},
  number = {1},
  pages = {26-27},
  url = {},
  link = {},
  abstract = {
Attracting new entrants to the world of research relies in part on the existence of role models in the popular media as observed by the current undergraduates. Hence, we examine the world of TV and films that get shown on cable all the time. There is no shortage of visible examples at the professorial level, from Archaeology (Indiana Jones) to Paleontology (Ross from Friends). However, it is a matter of concern that the first stage of an academic career, the research student, is less frequently portrayed. We will consider a number of examples of media portayals of young researchers based on a uniform sample of those I could think of. We analyze whether their depiction will encourage young people to follow their lead into PhD programmes, and also to successfully complete their studies.}
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {An Improved Data Stream Summary: The Count-Min Sketch 
                     and its Applications},
  year = {2005},
  journal = {Journal of Algorithms},
  note = {},
  month = {April},
  volume = {55},
  number = {1},
  pages = {58--75},
  abstract = {We introduce a new sublinear space data structure---the Count-Min Sketch--- for summarizing data streams. Our sketch allows fundamental queries in data stream summarization such as point, range, and inner product queries to be approximately answered very quickly; in addition, it can be applied to solve several important problems in data streams such as finding quantiles, frequent items, etc. The time and space bounds we show for using the CM sketch to solve these problems significantly improve those previously known --- typically from $1/epsilon^2$ to $1/\epsilon$ in factor.},
  url = {../papers/cm-full.pdf},
  link = {CormodeMuthukrishnan04CMLatin.html},
  pubsite = {}
  author = {G. Cormode and M. Datar and P. Indyk and 
                   S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {Comparing Data Streams Using {H}amming Norms},
  year = {2003},
  pages = {529--541},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {15},
  number = {3},
  note = {},
  url = {../papers/cdim-hammingnormtkde.pdf},
  link = {CormodeDatarIndykMuthukrishnan02.html},
  pubsite = {},
  abstract = {
Massive data streams are now fundamental to many data processing
applications.  For example, Internet routers produce large scale
diagnostic data streams.  Such streams are rarely stored in
traditional databases, and instead must be processed ``on the fly'' as
they are produced.  Similarly, sensor networks produce multiple data
streams of observations from their sensors.  There is growing focus on
manipulating  data streams, and hence, there is a need to identify
basic operations of interest in managing data streams, and to support
them efficiently.
We propose computation of the Hamming norm as a basic operation of
interest.  The Hamming norm formalizes ideas
that are used throughout data processing.
When applied to a single stream, the Hamming norm gives the number of distinct 
items that are
present in that data stream, which is a statistic of great interest in
databases.  When applied to a pair of streams, the Hamming norm gives
an important measure of (dis)similarity:  the number of unequal item
counts in the two streams. Hamming norms have many uses
in comparing data streams.

We present a novel approximation technique for estimating the Hamming norm
for massive data streams; this relies on what we call the ``$l_0$
{\it sketch}''
and we prove its accuracy. We test our approximation method on a large
quantity of synthetic and real stream data, and show that the
estimation is accurate to within a few percentage points. }
  author = {G. Cormode and S. Muthukrishnan},
  att_authors = {gc2602},
  att_private = {false},
  title = {The String Edit Distance Matching Problem with Moves},
  year = {2007},
  journal = {{ACM} Transactions on Algorithms},
  volume = 3,
  number = 1,
  url = {../papers/editmovestalg.pdf},
  link = {CormodeMuthukrishnan02.html},
  pubsite = {},
  abstract = {
The edit distance between two strings $S$ and $R$
is defined to be the minimum number of character inserts,
deletes and changes needed to convert $R$ to $S$. 
Given a text string $t$ of length $n$, and a pattern string $p$
of length $m$, informally, the string edit distance matching problem is to 
compute the smallest edit distance between $p$ and 
substrings of $t$.

We relax the problem
so that (a) we allow an additional operation, namely, 
{\em substring moves}, and (b) we 
allow approximation of this string edit distance.
Our result is a near linear time deterministic 
algorithm to produce a factor of $O(\log n \log^* n)$
approximation to the string edit distance with moves. 
This is the first
known significantly subquadratic algorithm for a string 
edit distance problem in which the distance involves
nontrivial alignments.
Our results are obtained by embedding strings into $L_1$
vector space using a simplified parsing technique we call 
{\em Edit Sensitive Parsing} (ESP). }
  author = {G. Ozsoyoglu and N. H. Balkir and
                  G. Cormode and  Z. M. Ozsoyoglu},
  att_authors = {gc2602},
  att_private = {false},
  title = {Electronic Books in Digital Libraries},
  year = {2004},
  note = {},
  journal = {{IEEE} Transactions on Knowledge and Data Engineering},
  volume = {16},
  number = {3},
  pages = {317--331},
  url = {../papers/obcotkde.pdf},
  link = {OzsoyogluCormodeBalkirOzsoyoglu00.html},
  pubsite = {},
  abstract = {
Electronic Book is an application with a multimedia database of instructional resources, which include hyperlinked text, instructor's audio/video clips, slides, animation, still images, etc. as well as content-based infomration about these data, and metadata such as annotations, tags, and cross-referencing information. Electronic books in the Internet or on CDs today are not easy to learn from. We propose the use of a multimedia database of instructional resources in constructing and delivering multimedia lessons about topics in an electronic book.

We introduce an electronic book data model containing (a) topic objects and and (b) instructional resources, called instruction modules which are multimedia presentations possibly capturing real-life lectures of instructors. We use the notion of topic prerequisites for topics at different detail levels, to allow electronic book users to request / compose multimedia lessons about topics on the electronic book. We present automated construction of the ''best'' user-tailored lesson (as a multimedia presentation). }

This file was generated by bibtex2html 1.92.