other.bib

@comment{{This file has been generated by bib2bib 1.92}}

@comment{{Command line: C:\Dropbox\data\data\webbib\bib2bib-1.92.exe -oc other -ob other.bib -c '($type = "TECHREPORT" | $type = "UNPUBLISHED" | $type="INCOLLECTION")' cormode.bib}}

@incollection{Cormode24LDP,
  author = {Graham Cormode},
  title = {Local Differential Privacy for Privacy-preserving Machine Learning},
  booktitle = {Differential Privacy in Artificial Intelligence: From Theory to Practice},
  publisher = {{\em now} publishers},
  doi = {http://dx.doi.org/10.1561/9781638284772},
  pages = {42-74},
  url = {../papers/ldpchapter.pdf},
  year = 2025,
  abstract = {As discussed in other chapters, Differential Privacy (DP) provides a widely-accepted model of
privacy based on introducing carefully calibrated random noise to information revealed about
private data. In the standard presentation, the DP model assumes the existence of a trusted aggregator: an entity who holds a collection of information about a population of individuals, and
applies differentially private mechanisms to information computed from this data collection. This
allows accurate statistics and models to be derived, but comes at a cost: we must be satisfied that
we can indeed trust this aggregator to handle the collected data responsibly. In practical applications, this data aggregator is likely to be a powerful entity, such as an internet service provider,
technology company, or government, who may collect the private information of millions of individuals. Hence the potential for misuse may be of some concern, even if we have no prior reason
to suspect the motives of the aggregator.

In response to this, a number of other models of privacy have been considered which aim
to reduce or eliminate the trust placed on the central entity. These can include decentralization---dividing the data among multiple aggregators, so no single one sees the entire information –
or cryptographic techniques---which restrict the view of the aggregator of the raw data. In this
chapter, we survey an approach known as Local Differential Privacy, or LDP. The LDP approach
directly provides a differential privacy guarantee on the results of the computation, and entirely
eliminates the need for a trusted aggregator to hold the private data. However, the LDP approach
comes at some cost: more computational work is needed, and the results achieve a weaker tradeoff
between accuracy and privacy than in the traditional, ``centralized'' differential privacy model.}
}

@incollection{Cormode25,
  author = {Graham Cormode},
  title = {Technical Perspective: Toward Building a Differentially Private {DBMS}},
  link = {https://dl.acm.org/doi/10.1145/3708551},
  booktitle = {Communications of the {ACM} ({CACM})},
  published = {},
  volume = 68,
  number = 3,
  pages = 92,
  year = 2025,
  abstract = {From the demographic statistics produced by national census bodies to the complex predictive models built by companies in “Big Tech” and finance, data is the fuel that powers these applications. Most such use cases rely on data derived from the properties and actions of individual people. This data is therefore considered sensitive and in need of protections to prevent inappropriate use or disclosure. Some protections come from enforcing policies, access control, and contractual agreements. But we also seek technical interventions—definitions and algorithms that can be applied by computer systems to protect private information while still enabling the intended use.}
}

@incollection{Cormode25SIGMOD,
  author = {Graham Cormode},
  title = {Technical Perspective on '{OmniSketch}: Streaming Data Analytics with Arbitrary Predicates'},
  booktitle = {SIGMOD Record},
  publisher = {},
  year = 2025,
  volume = 54,
  number = 1,
  pages = 27,
  url = {../papers/TPomnisketch.pdf},
  doi = {10.1145/3733620.3733626},
  abstract = {Many applications in data management can benefit from a ``data sketch'': a compact data structure that captures the important features of a much larger object. Over the years,
many different sketches have been proposed for finding a variety of foundational statistics about an evolving dataset:
the number of distinct elements, the median element, the frequently occurring elements (heavy hitters), and several
other metrics and norms. These have been used by the data management community for a number of purposes: for efficient measurements when the full data is too large to retain;
to guide query planning; and to support approximate query answering.}
}

@incollection{Cormode24,
  author = {Graham Cormode},
  title = {Technical Perspective on 'Better Differentially Private Approximate Histograms and Heavy Hitters using the Misra-Gries Sketch},
  booktitle = {SIGMOD Record},
  publisher = {ACM},
  month = mar,
  volume = {53(1)},
  pages = 6,
  year = 2024,
  url = {../papers/tpdpmg.pdf},
  abstract = {
  The topics of private data analysis and streaming data management have both been separately the focus of much study within the data management community for many years. 
  However, more recently there have been studies which bring these two previously isolated topics together. 
  
  Although data streams and privacy might not seem to have much in common, it turns out that they share a symbiotic relationship. 
  Within data streams, a common pattern is to design a compact data structure that summarizes the input that has been seen so far, and which can be updated quickly in order to reflect a small change. 
  Within privacy, it is often helpful to build an intermediate representation of a data set, so that a small change to the input does not change the representation much. 
  Moreover, both areas make use of the tools of randomized algorithms and probability to prove that their mechanisms give strong guarantees.  
  Thus, data stream summaries can sometimes inspire private algorithms, and vice-versa.}
}

@incollection{Cormode23,
  author = {Graham Cormode},
  booktitle = {SIGMOD Record},
  publisher = {ACM},
  month = mar,
  volume = {52(1)},
  pages = 114,
  year = 2023,
  url = {../papers/r2t.pdf},
  title = {Technical Perspective on `{R2T: Instance-optimal Truncation for Differentially Private Query Evaluation with Foreign Keys}'},
  abstract = {Increased use of data to inform decision making has brought with it a rising awareness of the importance of privacy, and the need for appropriate mitigations to be put in place to protect the interests of individuals whose data is being processed. 
From the demographic statistics that are produced by national censuses, to the complex predictive models built by ``big tech'' companies, data is the fuel that powers these applications. 
A majority of such uses rely on data that is derived from the properties and actions of individual people. 
This data is therefore considered sensitive, and in need of protections to prevent inappropriate use or disclosure. 
Some protections come from enforcing policies, access control, and contractual agreements. 
But in addition, we also seek technical interventions: definitions and algorithms that can be applied by computer systems in order to protect the private information while still enabling the intended use.  }
}

@incollection{Cormode23PODS,
  author = {Graham Cormode},
  year = 2023,
  booktitle = {{ACM} Principles of Database Systems ({PODS})},
  publisher = {ACM},
  title = {Gems of PODS: Applications of Sketching and Pathways to Impact},
  url = {../papers/podssketches.pdf},
  slides = {../slides/gems-sketches.pdf},
  doi = {https://doi.org/10.1145/3584372.3589937},
  abstract = {  Data summaries (a.k.a., sketches) are compact data structures that can be updated flexibly and efficiently to capture certain properties of a data set. 
  Well-known examples include set summaries (Bloom Filters) and cardinality estimators (e.g., Hyperloglog), amongst others. 
  PODS and SIGMOD have been home to many papers on sketching, including several best paper recipients. 
  Sketch algorithms have emerged from the theoretical research community, but have found wide impact in practice. 
  This paper describes some of the impacts that sketches have had, from online advertising to privacy-preserving data analysis.  
  It will consider the range of different strategies that researchers can follow to encourage the adoption of their work, and what has and has not worked for sketches as a case study.  }
}

@incollection{Cormode21perspective,
  year = 2021,
  author = {G. Cormode},
  title = {Technical Perspective: A Framework for Adversarially Robust Streaming Algorithms},
  publisher = {ACM},
  booktitle = {SIGMOD Record},
  volume = {50(1)},
  month = mar,
  note = {Introduction to research highlights paper by Ben-Eliezer, Jayaram, Woodruff, and Yogev},
  url = {../papers/robuststream.pdf},
  link = {https://sigmodrecord.org/?smd_process_download=1&download_id=7268}
}

@incollection{Cormode19,
  year = 2019,
  title = {The true cost of popularity},
  publisher = {ACM},
  booktitle = {Communications of the {ACM} ({CACM})},
  month = jul,
  author = {G. Cormode},
  note = {Introduction to research highlights paper by Larsen, Nelson, Nguyen and Thorup},
  url = {../papers/popularity-cormode.pdf}
}

@incollection{Cormode19b,
  year = 2019,
  author = {G. Cormode},
  title = {Technical Perspective: {$\epsilon$}{\sc ktelo}: A Framework for Defining Differentially-Private Computations},
  publisher = {ACM},
  booktitle = {SIGMOD Record},
  volume = {48(1)},
  month = mar,
  note = {Introduction to research highlights paper by Zhang, McKenna, Kotsogiannis, Bissias, Hay, Machanavajjhala and Gerome Miklau},
  url = {../papers/ektelo.pdf},
  link = {https://sigmodrecord.org/publications/sigmodRecord/1903/pdfs/05_tp-eaf-cormode.pdf}
}

@techreport{CormodeJowhariMonemizadehMuthukrishnan16,
  author = {Graham Cormode and Hossein Jowhari and Morteza Monemizadeh and S. Muthukrishnan},
  title = {The Sparse Awakens: Streaming Algorithms for Matching Size Estimation in Sparse Graphs},
  year = 2016,
  key = {sparse16},
  institution = {ArXiV},
  link = {http://arxiv.org/abs/1608.03118},
  abstract = {
      Estimating the size of the maximum matching is a canonical problem in
      graph algorithms, and one that has attracted extensive study over a
      range of different computational models. 
      We present improved streaming algorithms for approximating the size of maximum matching with sparse (bounded arboricity) graphs. 
      
      * {\em (Insert-Only Streams)}
      We present a one-pass algorithm that takes  $O(c\log^2 n)$ space and approximates the size of the  maximum matching in 
      graphs with arboricity $c$ within a factor of $O(c)$. This improves significantly upon the state-of-the-art ${O}\sim(cn^{2/3})$-space streaming algorithms.  
      
      * {\em (Dynamic Streams)}
      Given a dynamic graph stream (i.e., inserts and deletes) of edges of an underlying $c$-bounded arboricity graph,  
      we present an one-pass algorithm that uses  space $O(c^{10/3}n^{2/3})$ 
      and returns an $O(c)$-estimator for the size of the maximum matching. 
      This algorithm improves the state-of-the-art ${O}\sim(cn^{4/5})$-space algorithms, where the ${O}\sim(.)$ notation hides logarithmic in $n$ dependencies.
      
      * {\em (Random Order Streams)}
      For randomly ordered graph streams, we present a two-pass streaming algorithm 
      that outputs a $(1+\epsilon)$-approximation of the maximal matching size using $O(d^{d^{d+6}})$ space
      for graphs with maximum degree bounded by $d$. The previous best algorithm for this setting is an $O(\log d)$ approximation. 
      Our algorithm is the first with an approximation guarantee independent
      of $d$. Our result is obtained by simulating a property-testing result 
      in a streaming setting, a technique which might be of independent interest. 
      
       In contrast to the previous works, our results take more advantage of the streaming access to the input and characterize the matching size 
      based on the ordering of the edges in the stream in addition to the degree
      distributions and structural properties of the sparse graphs. 
}
}

@incollection{CormodeCM15,
  author = {Graham Cormode},
  title = {Encyclopedia entry on 'Count-Min Sketch' },
  editor = {Ming-Yang Kao},
  year = 2015,
  url = {../papers/encalgs-cm.pdf},
  pubsite = {http://link.springer.com/referenceworkentry/10.1007%2F978-3-642-27848-8_579-1},
  booktitle = {Encyclopedia of Algorithms},
  publisher = {Springer}
}

@incollection{CormodeAMS15,
  author = {Graham Cormode},
  title = {Encyclopedia entry on 'AMS Sketch' },
  editor = {Ming-Yang Kao},
  year = 2015,
  url = {../papers/encalgs-ams.pdf},
  pubsite = {http://link.springer.com/referenceworkentry/10.1007%2F978-3-642-27848-8_578-1},
  booktitle = {Encyclopedia of Algorithms},
  publisher = {Springer}
}

@incollection{CormodeMG15,
  author = {Graham Cormode},
  title = {Encyclopedia entry on 'Misra-Gries Summary' },
  editor = {Ming-Yang Kao},
  year = 2015,
  url = {../papers/encalgs-mg.pdf},
  pubsite = {http://link.springer.com/referenceworkentry/10.1007%2F978-3-642-27848-8_572-1},
  booktitle = {Encyclopedia of Algorithms},
  publisher = {Springer}
}

@incollection{Cormode11sketch,
  title = {Sketch Techniques for Massive Data},
  author = {Graham Cormode},
  booktitle = {Synposes for Massive Data: Samples, 
Histograms, Wavelets and Sketches},
  att_authors = {gc2602},
  att_private = {false},
  editor = {Graham Cormode and Minos Garofalakis and Peter Haas and Chris Jermaine},
  series = {Foundations and Trends in Databases},
  publisher = {NOW publishers},
  year = 2011,
  url = {../papers/sk.pdf},
  abstract = {Sketch techniques have undergone extensive development within the past few
years. They are especially appropriate for the data streaming scenario, in
which large quantities of data flow by and the the sketch summary must
continually be updated quickly and compactly.  Sketches, as presented here,
are designed so that the update caused by each new piece of data is largely
independent of the current state of the summary.  This design choice makes
them faster to process, and also easy to parallelize.

``Frequency based sketches'' are concerned with summarizing the
observed frequency distribution of a dataset.  From these sketches,
accurate estimations of individual frequencies can be extracted.  This
leads to algorithms to find the approximate heavy hitters (items which
account for a large fraction of the frequency mass) and quantiles (the
median and its generalizations).  The same sketches are also used to
estimate (equi)join sizes between relations, self-join sizes and range
queries.  These can be used as primitives within more complex mining
operations, and to extract wavelet and histogram representations of
streaming data.

A different style of sketch construction leads to sketches for
distinct-value queries.  As mentioned above, using a sample to estimate the
answer to a COUNT DISTINCT query does not give accurate results.  In
contract, sketching methods which can make a pass over the whole data can
provide guaranteed accuracy.  Once built, these sketches estimate not only
the cardinality of a given attribute or combination of attributes, but also
the cardinality of various operations performed on them, such as set
operations (union and difference), and selections based on arbitrary
predicates.}
}

@incollection{Cormode11CD,
  author = {Graham Cormode},
  title = {Continuous Distributed Monitoring: A Short Survey},
  att_authors = {gc2602},
  att_private = {false},
  publisher = {ACM},
  booktitle = {Paper accompanying invited talk at Algorithms and Models for Distributed Event Processing (AlMoDEP)},
  month = sep,
  year = 2011,
  url = {../papers/cdsurvey.pdf},
  slides = {../slides/cdsurvey.pdf},
  abstract = {In the model of continuous distributed monitoring, a number of
observers each see a stream of observations.  Their goal is to work
together to compute a function of the union of their observations.
This can be as simple as counting the total number of observations, or
more complex non-linear functions such as tracking the entropy of the
induced distribution.  Assuming that it is too costly to simply
centralize all the observations, it becomes quite challenging to
design solutions which provide a good approximation to the current
answer, while bounding the communication cost of the observers, and
their other resources such as their space usage.  This survey 
introduces this model, and describe a selection results in this
setting, from the simple counting problem to a variety of other
functions that have been studied.}
}

@incollection{BhagatCormodeMuthukrishnan11,
  author = {S. Bhagat and Graham Cormode and S. Muthukrishnan},
  title = {Node Classification in Social Networks},
  booktitle = {Social Network Data Analytics},
  editor = {Charu C. Aggarwal},
  publisher = {Springer},
  link = {http://arxiv.org/abs/1101.3291},
  url = {../papers/graphlabelchapter.pdf},
  year = 2011,
  abstract = {
  When dealing with large graphs, 
  such as those that arise in the
  context of online social networks, a subset of nodes may be labeled. 
  These labels can indicate demographic values, interest, beliefs or
  other characteristics of the nodes (users). 
  A core problem is to use this information to extend the labeling so
  that all nodes are assigned a label (or labels). 
  
  In this chapter, we survey classification techniques that have been proposed for this
  problem. 
  We consider two broad categories: methods based on iterative
  application of traditional classifiers using graph information as
  features, and methods which propagate the existing labels via random
  walks. 
  We adopt a common perspective on these methods to highlight the
  similarities between different approaches within and across the two
  categories. 
  We also describe some extensions and related directions to the central 
  problem of node classification. }
}

@techreport{CormodeThalerYi10,
  author = {Graham Cormode and Justin Thaler and Ke Yi},
  title = {Verifying Computations with Streaming Interactive Proofs},
  institution = {Electronic Colloquium on Computational Complexity (ECCC)},
  volume = {17},
  year = {2010},
  number = {TR10-159},
  pubsite = {http://eccc.hpi-web.de/report/2010/159}
}

@incollection{Cormode09b,
  title = {Encyclopedia Entry on {'Count-Min Sketch'}},
  author = {Graham Cormode},
  att_authors = {gc2602},
  att_private = {false},
  pages = {511-516},
  year = {2009},
  editor = {Ling Liu and M. Tamer Ozsu},
  url = {../papers/cmencyc.pdf},
  pubsite = {http://www.springerlink.com/content/g8m2427656n4npp8/},
  booktitle = {Encyclopedia of Database Systems},
  publisher = {Springer},
  abstract = {
  The Count-Min (CM) Sketch is a compact summary data structure capable
  of representing a high-dimensional vector and answering queries on this vector, 
  in particular point queries and dot product queries, 
  with strong accuracy guarantees. 
  Such queries are at the core of many computations, so the structure
  can be used in order to answer a variety of other queries, such as
  frequent items (heavy hitters), quantile finding, join size
  estimation, and more. 
  Since the data structure can easily process updates in the form of
  additions or subtractions to dimensions of the vector (which may
  correspond to insertions or deletions, or other transactions), it is
  capable of working over streams of updates, at high rates.
  
  The data structure maintains the linear projection of the vector with
  a number of other random vectors.
  These vectors are defined implicitly by simple hash functions. 
  Increasing the range of the hash functions increases the accuracy of
  the summary, and increasing the number of hash functions decreases the 
  probability of a bad estimate. 
  These tradeoffs are quantified precisely below. 
  Because of this linearity, CM sketches can be scaled, added and
  subtracted, to produce summaries of the corresponding scaled and
  combined vectors. }
}

@techreport{Cormode10,
  author = {Graham Cormode},
  title = {Individual Privacy vs Population Privacy: Learning to Attack Anonymization},
  institution = {arXiv},
  pubsite = {http://arxiv.org/abs/1011.2511},
  number = {arXiv:1011.2511},
  year = 2010,
  abstract = {
  Over the last decade there have been great strides made in developing techniques to compute functions privately. In particular, Differential Privacy gives strong promises about conclusions that can be drawn about an individual. In contrast, various syntactic methods for providing privacy (criteria such as kanonymity and l-diversity) have been criticized for still allowing private information of an individual to be inferred. In this report, we consider the ability of an attacker to use data meeting privacy definitions to build an accurate classifier. We demonstrate that even under Differential Privacy, such classifiers can be used to accurately infer ``private'' attributes in realistic data. We compare this to similar approaches for inferencebased attacks on other forms of anonymized data. We place these attacks on the same scale, and observe that the accuracy of inference of private attributes for Differentially Private data and l-diverse data can be quite similar.}
}

@techreport{CormodeGarofalakis06,
  author = {Graham Cormode and Minos Garofalakis},
  title = {Histograms and Wavelets on Probabilistic Data},
  institution = {arXiv},
  pubsite = {http://arxiv.org/abs/0806.1071},
  number = {arXiv:0806.1071},
  year = 2008,
  abstract = {
There is a growing realization that uncertain information is a first-class citizen in modern 
database management. As such, we need techniques to correctly and efficiently process 
uncertain data in database systems. In particular, data reduction techniques that can produce 
concise, accurate synopses of large probabilistic relations are crucial. Similar to their 
deterministic relation counterparts, such compact probabilistic data synopses can form the 
foundation for human understanding and interactive data exploration, probabilistic query 
planning and optimization, and fast approximate query processing in probabilistic database 
systems.
In this paper, we introduce definitions and algorithms for building histogram- and 
wavelet-based synopses on probabilistic data. The core problem is to choose a set of histogram 
bucket boundaries or wavelet coefficients to optimize the accuracy of the approximate 
representation of a collection of probabilistic tuples under a given error metric. For a 
variety of different error metrics, we devise efficient algorithms that construct optimal or 
near optimal B-term histogram and wavelet synopses. This requires careful analysis of the 
structure of the probability distributions, and novel extensions of known 
dynamic-programming-based techniques for the deterministic domain. Our experiments show that 
this approach clearly outperforms simple ideas, such as building summaries for samples drawn 
from the data distribution, while taking equal or less time.}
}

@techreport{CormodeKornTirthapura07,
  author = {G. Cormode and F. Korn and S. Tirthapura},
  title = {Time Decaying Aggregates in Out-of-order streams},
  year = 2007,
  number = {2007-10},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  link = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2007/2007-10.html},
  abstract = {
Processing large data streams is now a major topic in data management. 
The data involved can be truly massive, and the required analyses 
complex. In a stream of sequential events such as stock feeds, sensor 
readings, or IP traffic measurements, tuples pertaining to recent events 
are typically more important than older ones. This can be formalized via 
time decay functions, which assign weights based on age. Decay functions 
such as sliding windows and exponential decay have been well studied 
under the assumption of well-ordered arrivals, i.e., data arrives in the 
order of increasing time stamps. However, data quality issues are 
prevalent in massive streams (due to network asynchrony and delays or 
possibly due to features inherent to the measurement process), and 
correct order of arrival cannot be guaranteed. We focus on the 
computation of decayed aggregates such as range queries, quantiles, and 
heavy hitters on out-of-order streams, where elements do not necessarily 
arrive in increasing order of timestamps. We give the first deterministic 
algorithms for approximating these aggregates under the sliding window, 
exponential and polynomial decay functions. Our techniques are based on 
extending existing data stream summaries, such as the q-digest. We show 
that the overhead for allowing out-of-order arrivals is small when 
compared to the case of well-ordered arrivals. Our experimental study 
confirms that these methods can be applied in practice and investigates 
how the specific decay function impacts performance. }
}

@techreport{CormodeMuthukrishnan05CS2,
  author = {G. Cormode and S. Muthukrishnan},
  title = {Combinatorial Algorithms for Compressed Sensing},
  att_authors = {gc2602},
  att_private = {false},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  year = 2005,
  number = {2005-40},
  url = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2005/2005-40.pdf},
  link = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2005/2005-40.html},
  abstract = { 
In sparse approximation theory, the fundamental problem
is to reconstruct a signal A \in R^n from linear measurements 
with respect to a dictionary of psi_i's. Recently, there is focus on the
novel direction of Compressed Sensing where the reconstruction can be
done with very few---O(k log n)---linear measurements over a modified
dictionary if the signal is compressible, that is, its information is
concentrated in k coefficients. In particular, these results prove that
there exists a single O(k log n) x n measurement matrix such that any
such signal can be reconstructed from these measurements, with error at
most O(1) times the worst case error for the class of such signals.
Compressed sensing has generated tremendous excitement both because of
the sophisticated underlying Mathematics and because of its potential
applications.

In this paper, we address outstanding open problems in Compressed
Sensing. Our main result is an explicit construction of a non-adaptive
measurement matrix and the corresponding reconstruction algorithm so
that with number of measurements polynomial in k, log n, 1/$\epsilon$, we
can reconstruct any compressible signal upto 1+epsilon error. This is
the first known polynomial time explicit construction of any such
measurement matrix.  Our result not only improves the error guarantee
from O(1) to 1 + $\epsilon$ but also improve the reconstruction time from
poly(n) to poly(k log n).

Our second result is a randomized construction of a measurement matrix
of size O(k polylog n) that works for each signal with high probability
and gives per-instance approximation guarantee rather than over the
class of all signals. Previous work on Compressed Sensing does not
provide such per-instance approximation guarantee; our result improves
the best known number of measurements known from prior work in other
areas including Learning Theory Streaming algorithms and Complexity
Theory for this case.

Our approach is combinatorial. In particular, we use two parallel sets
of group tests, one to filter and the other to certify and estimate; the
resulting algorithms are quite simple to implement. }
}

@techreport{CormodeMuthukrishnan05CS,
  author = {G. Cormode and S. Muthukrishnan},
  title = {Towards an Algorithmic Theory of Compressed Sensing},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  year = 2005,
  number = {2005-25},
  url = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2005/2005-25.pdf},
  link = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2005/2005-25.html},
  abstract = {
In Approximation Theory, the fundamental problem is to reconstruct a 
signal A in R^n from linear measurements with respect to a dictionary Psi 
for R^n. Recently, there has been tremendous excitement about the novel 
direction of Compressed Sensing [Donoho 04] where the reconstruction can 
be done with very few---O(k log n)---linear measurements over a modified 
dictionary Psi' if the information of the signal is concentrated in k 
coefficients over an orthonormal basis Psi. These results have 
reconstruction error on any given signal that is optimal with respect to a 
broad class of signals. In a series of papers and meetings over the past 
year, a theory of Compressed Sensing has been developed by mathematicians. 
We develop an algorithmic perspective for the Compressed Sensing problem, 
showing that Compressed Sensing results resonate with prior work in Group 
Testing, Learning theory and Streaming algorithms. Our main contributions 
are new algorithms that present the most general results for Compressed 
Sensing with (1+$\epsilon$) approximation on every signal, faster algorithms for 
the reconstruction, as well as succinct transformations of Psi to Psi'. }
}

@incollection{CormodeMuthukrishnan06CISS,
  author = {G. Cormode and S. Muthukrishnan},
  title = {Combinatorial Algorithms for Compressed Sensing},
  booktitle = {Proceedings of Conference on Information Sciences and Systems ({CISS})},
  year = {2006},
  publisher = {IEEE},
  note = {Invited submission},
  slides = {../slides/cs-cissslides.pdf},
  url = {../papers/cs-ciss.pdf},
  link = {https://ieeexplore.ieee.org/document/4067802},
  abstract = {
In sparse approximation theory, the fundamental problem is to reconstruct a
signal $A \in R^n$ from linear measurements $< A,{\psi_i}>$
with respect to a dictionary of $\psi_i$'s.
Recently, there is focus on the novel
direction of {\em Compressed Sensing}
where the reconstruction can
be done with very few---$O(k \log n)$---linear measurements over a modified
dictionary if the signal is {\em compressible}, that is,
its information is concentrated in $k$ coefficients with the original
dictionary.
In particular, these results
prove that there exists a single $O(k\log n) \times n$ measurement
matrix such that any such signal can be reconstructed
from these measurements, with error at most $O(1)$ times the
worst case error for the class of such signals.
Compressed sensing has generated tremendous excitement
both because of the sophisticated underlying Mathematics

In this paper, we address outstanding open problems in
Compressed Sensing.
Our main result is an
explicit construction of a non-adaptive measurement
matrix and the corresponding reconstruction algorithm
so that with a number of measurements polynomial in $k, \log n, 1/\varepsilon$,
we can reconstruct compressible signals.
This is the first known polynomial time
explicit construction of any such measurement matrix. 
In addition,
our result improves the error guarantee from $O(1)$ to
$1 + \varepsilon$ and  improves the reconstruction time
from $poly(n)$ to $poly(k\log n)$.

Our second result is a randomized construction of
$O(k polylog(n))$ measurements
that work for each signal with
high probability and gives per-instance approximation guarantees
rather than over the class of all signals.
Previous work on Compressed Sensing does not provide such per-instance
approximation guarantees; our result
improves the best known number of measurements
known from prior work in other areas including
Learning Theory, Streaming
algorithms and Complexity Theory for this case.

Our approach is combinatorial. In particular, we use two
parallel sets of group tests, one to filter and the other to
certify and estimate; the resulting algorithms are quite simple to
implement.}
}

@incollection{CormodeGarofalakis05DEB,
  author = {G. Cormode and M. Garofalakis},
  title = {Efficient Strategies for
                  Continuous Distributed Tracking Tasks},
  att_authors = {gc2602},
  att_private = {false},
  publisher = {IEEE},
  booktitle = {IEEE Data Engineering Bulletin},
  pages = {33--39},
  month = {March},
  year = {2005},
  url = {../papers/cdtt.pdf},
  note = {},
  abstract = {
While traditional databases have focused on single query
evaluation in a centralized setting, emerging applications
require {\em continuous} tracking of queries on data that is
widely {\em distributed} and constantly updated.
We describe such scenarios, and describe the challenges involved in
designing communication-efficient protocols for the tracking tasks we
define.
We outline some solutions to these problems, by abstracting a model of
the communication system, defining the tracking tasks of interest, and
building query-tracking schemes based on three guiding principles of
minimizing global information, using summaries to capture whole data
streams, and seeking stability of the protocols.}
}

@techreport{CormodeMuthukrishnanRozenbaum05,
  author = {G. Cormode and S. Muthukrishnan and I. Rozenbaum},
  title = {Summarizing and Mining Inverse Distributions on Data 
                   Streams via Dynamic Inverse Sampling},
  number = {2005-11},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  url = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2005/2005-11.html},
  year = {2005},
  abstract = {
Database management systems face the challenge of
dealing with massive data distributions which
arrive at high speeds while there is only small storage
available for managing and mining them.
Emerging data stream management systems approach this
problem by summarizing and mining the distributions
using samples or sketches. However, data distributions
can be ``viewed'' in different ways. For example,
a data stream of integer values can be viewed either
as the {\em forward} distribution $f(x)$, ie., the
number of occurrences of $x$ in the
stream, or as its inverse, $f^{-1}(i)$, which is
the number of items that appear $i$ times.
While both such ``views'' are equivalent in stored
data systems, over data streams that entail approximations,
they may be significantly different. In other words,
samples and sketches developed for the forward distribution
may be ineffective for summarizing or mining the
inverse distribution. Yet, many applications such as
IP traffic monitoring naturally rely on mining inverse
distributions.

We formalize the problems of managing and mining inverse
distributions and show provable differences between summarizing
the forward distribution vs the inverse distribution.
We present methods for summarizing and mining
inverse distributions of data streams: they rely
on a novel technique to maintain a dynamic sample over the stream with
provable guarantees which can be used for variety of
summarization tasks (building quantiles or equidepth histograms)
and mining (anomaly detection: finding heavy hitters,
and measuring the number of rare items), all with provable
guarantees on quality of approximations and time/space used by
our streaming methods. We also complement our analytical
and algorithmic results by presenting an experimental
study of the methods over network data streams.}
}

@incollection{Cormode05,
  author = {G. Cormode},
  att_authors = {gc2602},
  att_private = {false},
  title = {Some Key Concepts in Data Mining -- Clustering},
  year = {2006},
  booktitle = {Discrete Methods in Epidemiology},
  publisher = {AMS},
  series = {DIMACS},
  pages = {2--9},
  volume = {70},
  url = {../papers/epidcluster.pdf},
  link = {http://www.ams.org/bookstore-getitem/item=dimacs-70}
}

@incollection{CormodeGarofalakis05,
  editor = {M. Garofalakis and J. Gehrke and R. Rastogi},
  booktitle = {Data Stream Management: Processing High-Speed Data 
Streams},
  publisher = {Springer},
  year = {2016},
  author = {G. Cormode and M. Garofalakis},
  title = {Join Sizes, Frequency Moments, and Applications},
  url = {../papers/ams.pdf},
  link = {http://www.springer.com/gb/book/9783540286073},
  abstract = {In this chapter, we focus on a set of problems chiefly inspired by the
problem of estimating the join size between two database relations. 
This problem is at the heart of a wide variety of other problems, both
in databases and beyond. 
Given two relations, $R$ and $S$, and an attribute $a$ common to both
relations, the {\em equi-join} between the two relations, $R$ and $S$, 
consists of
one tuple for each $r \in R$ and $s \in S$ pair such that 
$r.a = s.a$. 
Estimating the joinsize between pairs of relations is a key component
in planning how to answer a complex SQL query that may contain
arbitrary combinations of selection, projection and join tasks. 
The result can also be used to answer some queries approximately. 
This applies directly in a streaming context, and additionally
within a traditional database management system that is operating on
the stream of updates generated by tuple insertions and deletions. 
Knowing the join size is also a important problem at the heart of a
variety of other problems, such as building histogram and wavelet
representations of data, and can be applied to problems such as
finding frequent items and quantiles, and modelling spatial and
high-dimensional data.}
}

@incollection{CormodeIndyk05,
  editor = {M. Garofalakis and J. Gehrke and R. Rastogi},
  booktitle = {Data Stream Management: Processing High-Speed Data 
Streams},
  year = {2016},
  publisher = {Springer},
  link = {../papers/stablechapter.pdf},
  author = {G. Cormode and P. Indyk},
  title = {Stable Distributions in Streaming Computations},
  abstract = {
In many streaming scenarios, we need to measure and quantify the data
that is seen.  
For example, we may want to 
  measure the number of distinct IP addresses
seen over the course of a day, 
compute the difference between incoming
and outgoing transactions in a database system 
or measure the overall
activity in a sensor network. 
More generally, we may want to cluster readings taken over periods of
time or in different places to find patterns, or find the most similar
signal from those previously observed to a new observation. 
For these measurements and comparisons to be meaningful, 
they must be well-defined. 
Here, we will use the well-known and widely used $L_p$ norms.
These encompass the familiar Euclidean (root of sum of squares) and 
Manhattan
(sum of absolute values) norms. 

In this chapter, we will show how $L_p$ distances can be
estimated for massive vectors presented in the streaming model.
This is achieved by making succinct {\em sketches} of the data, which
can be used as synopses of the vectors they summarize.}
}

@techreport{Cormode04LemsTR,
  author = {G. Cormode},
  title = {The Hardness of the Lemmings Game, or, ``{Oh} no, more {NP-C}ompleteness Proofs''},
  year = {2004},
  number = {2004-11},
  link = {Cormode04.html},
  pubsite = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2004/2004-11.html},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  abstract = {
In the computer game `Lemmings', the player must guide a tribe of green-haired
lemming creatures to safety, and save them from an untimely demise. 
We formulate the decision problem which is, given a level of the game, to
decide whether it is possible to complete the level (and hence to find 
a solution to that level).  
Under certain limitations, this can be decided in polynomial
time, but in general the problem is shown to be NP-Hard.  
This can hold even if there
is only a single lemming to save, thus showing that it is hard to
approximate the number of saved lemmings to any factor.}
}

@techreport{AbelloCormode04,
  author = {J. Abello and G. Cormode},
  title = {Report on {DIMACS} Working Group Meeting: Data Mining 
                   and Epidemiology, {M}arch 18-19, 2004},
  url = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2004/2004-37.html},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  number = {2004-37},
  year = {2004},
  abstract = {
Epidemiology is an observational science that concerns itself with finding and explaining patterns of health and disease in populations, usually of humans, but also populations of animals, insects and plants. Data mining is an active area of research interested in finding algorithms for describing latent patterns in often very large data sets. This Working Group had the objective of fostering collaboration between these two disciplines. In March of 2004 it organized a two-day meeting at DIMACS to bring these two groups together in a format designed to initiate such collaborations.}
}

@techreport{CormodeCzumajMuthukrishnan04TR,
  author = {G. Cormode and A. Czumaj and S. Muthukrishnan},
  title = {How to {\em increase} the acceptance ratios of top conferences},
  url = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2004/2004-12.html},
  number = {2004-12},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  year = {2004},
  link = {CormodeCzumajMuthukrishnan04.html},
  abstract = {
In the beginning was the pub. This work was triggered by a pub conversation where the authors observed that many resumes list acceptance ratios of conferences where their papers appear, boasting the low acceptance ratio. The lower the ratio, the better your paper looks. The list might look equally impressive if one listed the rejection ratio of conferences where ones paper was submitted and rejected. We decided to lampoon rather than lament the effort the PC typically put in: wouldnt the world be better if we could encourage only high quality submissions and so run top conferences with very high acceptance ratios? This paper captures our thoughts, and it is best consumed in a pub (and in color).}
}

@techreport{CormodeMuthukrishnan03RadialTR,
  author = {G. Cormode and S. Muthukrishnan},
  title = {Radial Histograms for Spatial Streams},
  year = {2003},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  number = {2003-11},
  url = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2003/2003-11.ps.gz},
  pubsite = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2003/2003-11.html},
  abstract = {
Many data streams relate to geographic or spatial information such as the tracking of moving objects, or location-specific measurements and queries. While several techniques have been developed recently for processing numerical, text or XML streams, new techniques are needed to process spatial queries on streaming geometric data. 

We propose a novel data structure, the Radial Histogram, to process streams including spatial data points. It allows a number of geometric aggregates involving the spread and extent of the points in the data streams---diameter, furthest neighbors, convex hulls---to be accurately estimated to arbitrary precision. By using multiple Radial Histograms for a set of given facilities, we can process data streams consisting of massive numbers of client points. We can then accurately estimate number of spatial aggregates involving relative placement of clients with respect to the facilities, including reverse nearest neighbors, spatial joins and more. 

Radial histograms use very small space and are exceedingly simple to implement. Nevertheless, we prove that they guarantee accurate estimation for spatial aggregate queries mentioned above. An experimental evaluation shows them to perform very well in practice.}
}

@techreport{CormodeMuthukrishnan01EditTR,
  author = {G. Cormode and S. Muthukrishnan},
  title = {The String Edit Distance Matching Problem with Moves},
  url = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2001/2001-26.ps.gz},
  pubsite = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2001/2001-26.html},
  number = {2001-26},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  link = {CormodeMuthukrishnan02.html},
  year = {2001},
  abstract = {
The edit distance between two strings S and R is defined to be the minimum number of character inserts, deletes and changes needed to convert R to S. Given a text string t of length n, and a pattern string p of length m, informally, the string edit distance matching problem is to compute the smallest edit distance between p and substrings of t. A well known dynamic programming algorithm takes time O(nm) to solve this problem, and it is an important open problem in Combinatorial Pattern Matching to significantly improve this bound. 

We relax the problem so that (a) we allow an additional operation, namely, substring moves, and (b) we approximate the string edit distance upto a factor of O(log n log* n).(log*n is the number of times log function is applied to n to produce a constant.) Our result is a neat linear time deterministic algorithm for this version of the problem. This is the first known significantly subquadratic algorithm for a string edit distance problem in which the distance involves nontrivial alignments. Our results are obtained by embedding strings into L_1 vector space using a simplified parsing technique we call Edit Sensitive Parsing (ESP). This embedding is approximately distance preserving, and we show many applications of this embedding to string proximity problems including nearest neighbors, outliers, and streaming computations with strings. }
}

@techreport{CormodeMuthukrishnan02DomTR,
  author = {G. Cormode and S. Muthukrishnan},
  title = {Estimating Dominance Norms of Multiple Data Streams},
  institution = {Center for Discrete Mathematics and Computer Science ({DIMACS})},
  link = {CormodeMuthukrishnan03DN.html},
  year = {2002},
  number = {2002-35},
  url = {ftp://dimacs.rutgers.edu/pub/dimacs/TechnicalReports/TechReports/2002/2002-35.ps.gz},
  pubsite = {http://dimacs.rutgers.edu/TechnicalReports/abstracts/2002/2002-35.html},
  abstract = {
Data streams often consist of multiple signals. Consider a stream of multiple signals (i,a_i,j) where i's correspond to the domain, j's index the different signals and a_i,j >= 0 to the value of the j'th signal at point i. We study the problem of determining the dominance norms over the multiple signals, in particular the max-dominance norm, defined as sum_i max_j {a_i,j}. It is used in applications to estimate the ``worst case influence'' of multiple processes, for example in IP traffic analysis, electrical grid monitoring and financial domain. Besides finding many applications, it is a natural measure: it generalizes the notion of union of data streams and may be alternately thought of as estimating the L1 norm of the upper envelope of multiple signals. 

We present the first known data stream algorithm for estimating max-dominance of multiple signals. In particular, we use workspace and time-per-item that are both sublinear (in fact, poly-logarithmic) in the input size. The algorithm is simple and implementable; its analysis relies on using properties of stable random distributions with small parameter alpha, which may be a technique of independent interest. In contrast we show that other dominance norms -- min-dominance sum_i min_j {a_i,j}, count-dominance (|{i | a_i > b_i}|) or relative-dominance (sum_i a_i/max{1,b_i}), are all impossible to estimate accurately with sublinear space. }
}

@unpublished{Cormode99,
title = {Topic Dependencies for Electronic Books},
author = {G. Cormode},
year = {1999},
note = {(unpublished manuscript)},
url = {../papers/eb.pdf},
abstract = {
Electronics books consist of a number of topics, and information about dependencies between
topics. We examine some theoretical problems related to handling these topic dependencies. In
particular we consider the case where these topic dependencies are acyclic, and nondecomposable
(that is, if topic $a$ and $b$ together require $c$, it is not necessarily the case that $a$ or $b$ alone require
$c$). We formalise the semantics of this system, and give an axiomatisation which we show is
sound and complete. We then show that we can easily compute the closure of a set of all topics
in this model, which is the set of topics which are required by that set. This closure is computed
in an identical manner to that for other formalisations of topic dependencies, showing that for
this purpose no distinction need be drawn.
We then consider a different kind of closure, where given a set of desired topics, we must
find a subset such that the total running time of all topics in its closure is less than a given time
bound. We show that this problem is NP-complete. We analyse some proposed heuristics to
approximate the solution of this problem and demonstrate that inputs can be engineered which
elicit very poor performance from these heuristics, showing that no factor of approximation can
be guaranteed for them. Finally, we transform the problem into a problem over a bipartite
graph, which in turn can be formulated as a problem in mathematical integer programming.
Hardness results for this problem give us confidence that a guaranteed approximation is unlikely
to exist.}
}

@unpublished{Cormode98,
  author = {G. Cormode},
  title = {Springs and Sound Layouts},
  year = {1998},
  pubsite = {../../images/web.gif},
  url = {../papers/sasl.pdf},
  note = {(unpublished manuscript)},
  abstract = {An empirical investigation of graph drawing algorithms performed as a final year project.}
}

@incollection{CormodeWDSA07,
  title = {Computational Fundamentals of Analyzing and Mining Data 
                  Streams},
  author = {G. Cormode},
  year = {2007},
  booktitle = {Workshop on Data Stream Analysis},
  url = {../papers/cormodewdsa.pdf},
  link = {TalkWDSA07.html},
  abstract = { 
Many scenarios, such as network analysis, utility monitoring, and 
financial applications,
generate massive streams of data. These streams consist of millions or 
billions of simple updates every
hour, and must be processed to extract the information described in tiny 
pieces. This survey provides
an introduction the problems of data stream monitoring, and some of the 
techniques that have been
developed over recent years to help mine the data while avoiding drowning 
in these massive flows of
information. In particular, this tutorial introduces the fundamental 
techniques used to create compact
summaries of data streams: sampling, sketching, and other synopsis 
techniques. It describes how to
extract features such as clusters and association rules. Lastly, we see 
methods to detect when and how
the process generating the stream is evolving, indicating some important 
change has occurred.}
}

This file was generated by bibtex2html 1.92.