@comment{{This file has been generated by bib2bib 1.92}}
@comment{{Command line: C:\alldata\data\webbib\bib2bib-1.92.exe -oc article -ob journal.bib -c '$type = "ARTICLE"' cormode.bib}}
@article{CormodeMitzenmacherThaler12Algo,
author = {Graham Cormode and Michael Mitzenmacher and Justin Thaler},
year = 2012,
att_authors = {gc2602},
att_private = {false},
link = {http://arxiv.org/abs/1004.2899},
url = {../papers/graphip.pdf},
journal = {Algorithmica},
title = {Streaming Graph Computations with a Helpful Advisor},
abstract = {Motivated by the trend to outsource work to commercial cloud computing
services, we consider a variation of the streaming paradigm where a
streaming algorithm can be assisted by a powerful helper that can
provide annotations to the data stream. We extend previous work on
such {\em annotation models} by considering a number of graph
streaming problems. Without annotations, streaming algorithms for
graph problems generally require significant memory; we show that
for many standard problems, including all graph problems that can
be expressed with totally unimodular integer programming formulations,
only constant space (measured in words) is
needed for single-pass algorithms given linear-sized annotations.
We also obtain protocols achieving essentially \textit{optimal} tradeoffs between
annotation length and memory usage for several important problems, including
integer matrix-vector multiplication, as well as
shortest $s$-$t$ path in small-diameter graphs. We also obtain non-trivial tradeoffs for minimum
weight bipartite perfect matching and shortest $s$-$t$ path in general graphs. }
}
@article{CormodeMuthukrishnan12,
author = {Graham Cormode and S. Muthukrishnan},
att_authors = {gc2602},
att_private = {false},
journal = {IEEE Software},
url = {../papers/cmsoft.pdf},
title = {Approximating Data with the Count-Min Data Structure},
year = 2012,
abstract = {Sketch data structures have been introduced to compactly summarize massive data sets and allow key properties of the data to be approximated accurately. In this paper, we introduce a prominent example, the Count-Min sketch, and describe how it accurately solves a fundamental problem of tracking counts of items in large data sets. Applications, implementations and extensions are also discussed.}
}
@article{CormodeMuthukrishnanYi11,
author = {Graham Cormode and S. Muthukrishnan and Ke Yi},
title = {Algorithms for distributed functional monitoring},
journal = {{ACM} Transactions on Algorithms},
att_authors = {gc2602},
att_private = {false},
volume = 7,
number = 2,
year = 2011,
pages = {1--21},
url = {../papers/cdxj.pdf},
link = {http://portal.acm.org/citation.cfm?doid=1921659.1921667},
abstract = {
We study what we call {\em functional monitoring} problems. We have $k$
players each receiving a stream of items, and communicating with a central
coordinator. Let the multiset of items received by player $i$ up until
time $t$ be $A_i(t)$. The coordinator's task is to monitor a given
function $f$ computed over the union of the inputs $\cup_{i} A_i(t)$, {\em
continuously} at all times $t$. The goal is to minimize the number of
bits communicated between the players and the coordinator. Of interest is
the approximate version where the coordinator outputs $1$ if $f \geq \tau$
and $0$ if $f\leq (1-\epsilon)\tau$. This defines the
$(k,f,\tau,\epsilon)$ distributed functional monitoring problem.
Functional monitoring problems are fundamental in distributed systems, in
particular sensor networks, where we must minimize communication; they also
connect to the well studied streaming model and communication complexity.
Yet few formal bounds are known for functional monitoring.
We give upper and lower bounds for the $(k,f,\tau,\epsilon)$ problem for
some of the basic $f$'s. In particular, we study the frequency moments
$F_p$ for $p=0,1,2$. For $F_0$ and $F_1$, we obtain monitoring algorithms
with costs almost the same as their one-shot computation algorithms.
However, for $F_2$ the monitoring problem seems much harder. We give a
carefully constructed multi-round algorithm that uses ``sketch summaries''
at multiple levels of details and solves the $(k,F_2,\tau,\epsilon)$
problem with communication $\tilde{O}(k^2/\epsilon + k^{3/2}/\epsilon^3)$.
Our algorithmic techniques are likely to be useful for other functional
monitoring problems as well.
}
}
@article{CormodeKrishnamurthyWillinger10,
author = {Graham Cormode and Balachander Krishnamurthy and Walter Willinger},
title = {A Manifesto for Modeling and Measurement in Social Media},
att_authors = {gc2602,bk1836,ww9241},
att_private = {false},
journal = {First Monday},
month = sep,
volume = 15,
number = 9,
year = 2010,
url = {../papers/mmmsm.pdf},
link = {http://firstmonday.org/htbin/cgiwrap/bin/ojs/index.php/fm/article/view/3072/2601},
abstract = {
Online Social Networks (OSNs) have been the subject of a great deal of study in recent years. The
majority of this study has used simple models, such as node-and-edge graphs, to describe the data. In
this paper, we argue that such models, which necessarily limit the structures that can be described and
omit temporal information, are insufficient to describe and study OSNs. Instead, we propose that a richer
class of Entity Interaction Network models should be adopted. We outline a checklist of features that
can help build such a model, and apply it to three popular networks (Twitter, Facebook and YouTube) to
highlight important features. We also discuss important considerations for the collection, validation and
sharing of OSN data.}
}
@article{CormodeGarofalakis10,
author = {Graham Cormode and Minos Garofalakis},
title = {Histograms and Wavelets on Probabilistic Data},
att_authors = {gc2602},
att_private = {false},
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
volume = 22,
number = 8,
year = 2010,
month = aug,
pages = {1142-1157},
url = {../papers/phistj.pdf},
link = {http://www.computer.org/portal/web/csdl/doi/10.1109/TKDE.2010.66},
abstract = {There is a growing realization that uncertain information is a first-class citizen in modern database management. As such, we need techniques to correctly and efficiently process uncertain data in database systems. In particular, data reduction techniques that can produce concise, accurate synopses of large probabilistic relations are crucial. Similar to their deterministic relation counterparts, such compact probabilistic data synopses can form the foundation for human understanding and interactive data exploration, probabilistic query planning and optimization, and fast approximate query processing in probabilistic database systems. In this paper, we introduce definitions and algorithms for building histogram- and Haar wavelet-based synopses on probabilistic data. The core problem is to choose a set of histogram bucket boundaries or wavelet coefficients to optimize the accuracy of the approximate representation of a collection of probabilistic tuples under a given error metric. For a variety of different error metrics, we devise efficient algorithms that construct optimal or near optimal size B histogram and wavelet synopses. This requires careful analysis of the structure of the probability distributions, and novel extensions of known dynamic-programming-based techniques for the deterministic domain. Our experiments show that this approach clearly outperforms simple ideas, such as building summaries for samples drawn from the data distribution, while taking equal or less time.}
}
@article{ChakrabartiCormodeMcGregor10,
author = {Amit Chakrabarti and Graham Cormode and Andrew McGregor},
att_authors = {gc2602},
att_private = {false},
title = {A Near-Optimal Algorithm for Computing the Entropy of a Stream},
journal = {{ACM} Transactions on Algorithms},
year = 2010,
volume = 6,
number = 3,
url = {../papers/entropyj.pdf},
abstract = { We describe a simple algorithm for approximating the empirical entropy
of a stream of $m$ values up to a multiplicative factor of $(1+\epsilon)$ using a single pass, $O(\epsilon^{-2} \log
(\delta^{-1}) \log m)$ words of space, and $O(\log \epsilon^{-1} + \log
\log \delta^{-1} + \log \log m)$ processing time per item in the
stream. Our algorithm is based upon a novel extension of a method
introduced by Alon, Matias, and Szegedy. This improves
over previous work on this problem.
We show a space lower bound of $\Omega({\epsilon^{-2}/ \log^2
(\epsilon^{-1})})$, demonstrating that our algorithm is near-optimal in terms of
its dependency on $\epsilon$.
We show that generalizing to multiplicative-approximation of the $k$-th order entropy requires close to
linear space for $k\geq 1$. In contrast we show that additive-approximation is possible in a single pass using only poly-logarithmic space. Lastly, we show how to compute a multiplicative
approximation to the entropy of a random walk on an undirected graph.}
}
@article{BerindeCormodeIndykStrauss10,
author = {Radu Berinde and Graham Cormode and Piotr Indyk and Martin Strauss},
att_authors = {gc2602},
att_private = {false},
title = {Space-optimal Heavy Hitters with Strong Error Bounds},
journal = {{ACM} Transactions on Database Systems},
year = 2010,
volume = 35,
number = 4,
url = {../papers/countersj.pdf},
abstract = {The problem of finding heavy hitters and approximating the frequencies
of items is at the heart of many problems in data stream analysis. It
has been observed that several proposed solutions to this problem can
outperform their worst-case guarantees on real data. This leads to the
question of whether some stronger bounds can be guaranteed. We answer
this in the positive by showing that a class of ``counter-based
algorithms'' (including the popular and very space-efficient Frequent
and SpaceSaving algorithms) provide much stronger approximation
guarantees than previously known. Specifically, we show that errors in
the approximation of individual elements do not depend on the
frequencies of the most frequent elements, but only on the frequency
of the remaining ``tail.''
This shows that counter-based methods are the
most space-efficient (in fact, space-optimal) algorithms having this
strong error bound.
This tail guarantee allows these algorithms to solve the ``sparse
recovery'' problem. Here, the goal is to recover a faithful
representation of the vector of frequencies, $f$. We prove that using
space $O(k)$, the algorithms construct an approximation $f^*$ to the
frequency vector $f$ so that the $L_1$ error $||f-f^*||_1$ is close to the best
possible error $\min_{f'} ||f' - f||_1$, where $f'$ ranges over all vectors
with at most $k$ non-zero entries. This improves the previously best
known space bound of about $O(k \log n)$ for streams without element deletions
(where $n$ is the size of the domain from which stream elements are drawn). Other
consequences of the tail guarantees are results for skewed (Zipfian) data, and
guarantees for accuracy of merging multiple summarized streams. }
}
@article{CormodeJestesLiYi10,
author = {Graham Cormode and Jeffrey Jestes and Feifei Li and Ke Yi},
att_authors = {gc2602},
att_private = {false},
title = {Semantics of Ranking Queries for Probabilistic Data },
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
year = 2011,
volume = 23,
number = 12,
pages = {1903--1917},
url = {../papers/exprankj.pdf},
abstract = {Recently, there have been several attempts to
propose definitions and algorithms for ranking queries on
probabilistic data. However, these lack many intuitive
properties of a top-$k$ over deterministic data. We define numerous
fundamental properties, including {\em exact-$k$}, {\em
containment}, {\em unique-rank}, {\em value-invariance}, and {\em
stability}, which are satisfied by ranking queries on certain
data. We argue these properties should also be carefully studied
in defining ranking queries in probabilistic data, and fulfilled by
definition for ranking uncertain data for most applications.
We propose an intuitive new ranking definition based on the
observation that the ranks of a tuple across all
possible worlds represent a well-founded rank distribution. We
studied the ranking definitions based on the expectation, the median
and other statistics of this rank distribution for a tuple and
derived the {\em expected rank, median rank and quantile rank}
correspondingly. We are able to prove that the expected rank, median
rank and quantile rank satisfy all these properties for a ranking
query. We provide efficient solutions to compute such rankings across
the major models of uncertain data, such as attribute-level and
tuple-level uncertainty.
Finally, a comprehensive experimental study
confirms the effectiveness of our approach.}
}
@article{CormodeSrivastavaYuZhang09,
author = {Graham Cormode and Divesh Srivastava and Ting Yu and Qing Zhang},
att_authors = {gc2602,ds8961},
att_private = {false},
title = {Anonymizing bipartite graph data using safe groupings },
journal = {The {VLDB} Journal},
volume = 19,
number = 1,
pages = {115--139},
year = 2010,
pubsite = {http://www.springerlink.com/content/441t1813l0t707ut},
url = {../papers/gprivj.pdf},
abstract = {Private data often comes in the form of associations between entities,
such as customers and products bought from a pharmacy, which are
naturally represented in the form of a large, sparse bipartite graph.
As with tabular data, it is desirable to be able to publish
anonymized versions of such data, to allow others to perform ad hoc
analysis of aggregate graph properties.
However, existing tabular anonymization techniques do not give useful
or meaningful results when applied to graphs: small changes or masking
of the edge structure can radically change aggregate graph properties.
We introduce a new family of anonymizations for bipartite graph
data, called $(k,l)$-groupings. These groupings preserve the
underlying graph structure perfectly, and instead anonymize the
mapping from entities to nodes of the graph. We identify a class
of ``safe'' $(k,l)$-groupings that have provable guarantees to
resist a variety of attacks, and show how to find such safe
groupings. We perform experiments on real bipartite graph data to
study the utility of the anonymized version, and the impact of
publishing alternate groupings of the same graph data. Our
experiments demonstrate that $(k,l)$-groupings offer strong
tradeoffs between privacy and utility.}
}
@article{CormodeHadjieleftheriou09b,
title = {Methods for finding frequent items in data streams },
author = {Graham Cormode and Marios Hadjieleftheriou},
att_authors = {gc2602,mh6516},
att_private = {false},
journal = {The {VLDB} Journal},
year = 2010,
volume = 19,
number = 1,
pages = {3--20},
pubsite = {http://www.springerlink.com/content/u5106x1223212882/},
url = {../papers/freqvldbj.pdf},
abstract = {
The frequent items problem is to process a stream of items and find all items occurring more than a given fraction of the time. It is one of the most heavily studied problems in data stream mining, dating back to the 1980s. Many applications rely directly or indirectly on finding the frequent items, and implementations are in use in large scale industrial systems. However, there has not been much comparison of the different methods under uniform experimental conditions. It is common to find papers touching on this topic in which important related work is mischaracterized, overlooked, or reinvented. In this paper, we aim to present the most important algorithms for this problem in a common framework. We have created baseline implementations of the algorithms and used these to perform a thorough experimental study of their properties. We give empirical evidence that there is considerable variation in the performance of frequent items algorithms. The best methods can be implemented to find frequent items with high accuracy using only tens of kilobytes of memory, at rates of millions of items per second on cheap modern hardware.}
}
@article{CormodeTirthapuraXu09b,
title = {Time-decaying Sketches for Robust Aggregation of Sensor Data},
author = {Graham Cormode and Srikanta Tirthapura and Bojian Xu},
att_authors = {gc2602},
att_private = {false},
journal = {{SIAM} Journal on Computing ({SICOMP})},
volume = 39,
number = 4,
pages = {1309-1339},
year = {2009},
url = {../papers/dupdecsicomp.pdf},
pubsite = {http://dx.doi.org/10.1137/08071795X},
abstract = {
We present a new sketch for summarizing network data. The sketch has
the following properties which make it useful in
communication-efficient aggregation in distributed streaming
scenarios, such as sensor networks: the sketch is
duplicate-insensitive, i.e. re-insertions of the same data will not
affect the sketch, and hence the estimates of aggregates. Unlike
previous duplicate-insensitive sketches for sensor data
aggregation, it is also time-decaying, so that
the weight of a data item in the sketch can decrease with time
according to a user-specified decay function. The sketch can give
provably approximate guarantees for various aggregates of data,
including the sum, median, quantiles, and frequent elements. The size
of the sketch and the time taken to update it are both polylogarithmic
in the size of the relevant data. Further, multiple sketches computed
over distributed data can be combined without loss of accuracy. To our
knowledge, this is the first sketch that combines all the above
properties.}
}
@article{CormodeTirthapuraXu09a,
title = {Time-decayed correlated aggregates over data streams},
author = {Graham Cormode and Srikanta Tirthapura and Bojian Xu},
att_authors = {gc2602},
att_private = {false},
journal = {Statistical Analysis and Data Mining},
volume = 2,
number = {5-6},
pages = {294-310},
year = 2009,
url = {../papers/coragsam.pdf},
pubsite = {http://www3.interscience.wiley.com/journal/122687469/abstract},
abstract = {Data stream analysis frequently relies on identifying correlations and posing conditional queries on the data after it has been seen. Correlated aggregates form an important example of such queries, which ask for an aggregation over one dimension of stream elements which satisfy a predicate on another dimension. Since recent events are typically more important than older ones, time decay should also be applied to downweight less significant values. We present space-efficient algorithms as well as space lower bounds for the time-decayed correlated sum, a problem at the heart of many related aggregations. By considering different fundamental classes of decay functions, we separate cases where efficient approximations with relative error or additive error guarantees are possible, from other cases where linear space is necessary to approximate. In particular, we show that no efficient algorithms with relative error guarantees are possible for the popular sliding window and exponential decay models, resolving an open problem. This negative result for the exponential decay holds even if the stream is allowed to be processed in multiple passes. The results are surprising, since efficient approximations are known for other data stream problems under these decay models. This is a step toward better understanding which sophisticated queries can be answered on massive streams using limited memory and computation.}
}
@article{YiLiCormodeHadjieleftheriouKolliosSrivastava09,
author = {Ke Yi and
Feifei Li and
Graham Cormode and
Marios Hadjieleftheriou and
George Kollios and
Divesh Srivastava},
title = {Small synopses for group-by query verification on outsourced
data streams},
att_authors = {gc2602,ds8961,mh6516},
att_private = {false},
journal = {{ACM} Transactions on Database Systems},
volume = {34},
number = {3},
year = {2009},
url = {../papers/pirs.pdf},
pubsite = {http://portal.acm.org/citation.cfm?doid=1567274.1567277},
abstract = {
Due to the overwhelming flow of information in many data stream
applications, data outsourcing is a natural and effective paradigm
for individual businesses to address the issue of scale. In the standard
data outsourcing model, the data owner outsources streaming data to one or
more third-party servers, which answer queries posed by a potentially
large number of clients on the data owner's behalf. Data outsourcing
intrinsically raises issues of trust, making outsourced query assurance on
data streams a problem with important practical implications. Existing
solutions proposed in this model all build upon cryptographic primitives
such as signatures and collision-resistant hash functions, which only work for
certain types of queries, e.g., simple selection/aggregation queries.
In this paper, we consider another common type of queries, namely, ``{\tt
GROUP BY, SUM}'' queries, which previous techniques fail to support.
Our new solutions are not based on cryptographic primitives, but instead
use algebraic and probabilistic techniques to compute a small synopsis on
the true query result, which is then communicated to the client so as to
verify the correctness of the query result returned by the server. The
synopsis uses a constant amount of space irrespective of the result
size, has an extremely small probability of failure, and can be maintained
using no extra space when the query result changes as elements stream by.
We then generalize our synopsis to allow some tolerance on the number of
erroneous groups, in order to support semantic load shedding on the server.
When the number of erroneous groups is indeed tolerable, the synopsis can
be strengthened so that we can locate and even correct these errors.
Finally, we implement our techniques and perform an empirical evaluation
using live network traffic.}
}
@article{CormodeHadjieleftheriou09,
author = {Graham Cormode and
Marios Hadjieleftheriou},
title = {Finding the frequent items in streams of data},
journal = {Communications of the {ACM} ({CACM})},
volume = {52},
number = {10},
year = {2009},
pages = {97-105},
url = {../papers/freqcacm.pdf},
pubsite = {http://portal.acm.org/citation.cfm?doid=1562764.1562789},
abstract = {
The frequent items problem is to process a stream of items and find all
those which occur more than a given fraction of the time.
It is one of the most heavily studied problems in mining data streams,
dating back to the 1980s.
Many other applications rely directly or indirectly on finding the frequent
items, and implementations are in use in large scale industrial systems.
In this paper, we describe the most important algorithms for
this problem in a common framework.
We place the different solutions in their historical context, and
describe the connections between them, with the aim of clarifying some
of the confusion that has surrounded their properties.
To further illustrate the different properties of the algorithms,
we provide baseline implementations.
This allows us to
give empirical evidence that there is considerable variation in the
performance of frequent item algorithms.
The best methods can be implemented to find frequent items with high
accuracy using only tens of kilobytes of memory, at rates of millions of
items per second on cheap modern hardware.}
}
@article{Cormode09,
title = {How NOT to review a paper: The tools and techniques of the adversarial reviewer},
author = {Graham Cormode},
journal = {SIGMOD Record},
att_authors = {gc2602},
att_private = {false},
month = dec,
year = 2008,
volume = 37,
number = 4,
pages = {100-104},
url = {../papers/adversarial.pdf},
pubsite = {http://portal.acm.org/citation.cfm?id=1519103.1519122},
abstract = {
There are several useful guides available for how to review a paper in
Computer Science.
These are soberly presented, carefully reasoned and sensibly argued.
As a result, they are not much fun.
So, as a contrast, this note is a checklist of how {\em not} to review
a paper.
It details techniques that are
unethical, unfair, or just plain nasty.
Since in Computer Science we often present arguments about how an
adversary would approach a particular problem, this note
describes the adversary's strategy.}
}
@article{CormodeKrishnamurthy08,
title = {Key Differences between Web 1.0 and Web 2.0},
author = {Graham Cormode and Balachander Krishnamurthy},
att_authors = {gc2602,bk1836},
att_private = {false},
journal = {First Monday},
month = jun,
volume = 13,
number = 6,
year = 2008,
link = {http://www.uic.edu/htbin/cgiwrap/bin/ojs/index.php/fm/article/view/2125/1972},
url = {../papers/web2.pdf},
abstract = {
Web 2.0 is a buzzword introduced in 200304 which is commonly used to encompass various novel
phenomena on the World Wide Web. Although largely a marketing term, some of the key attributes
associated with Web 2.0 include the growth of social networks, bidirectional communication,
various glue technologies, and significant diversity in content types. We are not aware of a
technical comparison between Web 1.0 and 2.0. While most of Web 2.0 runs on the same substrate as
1.0, there are some key differences. We capture those differences and their implications for
technical work in this paper. Our goal is to identify the primary differences leading to the
properties of interest in 2.0 to be characterized. We identify novel challenges due to the
different structures of Web 2.0 sites, richer methods of user interaction, new technologies, and
fundamentally different philosophy. Although a significant amount of past work can be reapplied,
some critical thinking is needed for the networking community to analyze the challenges of this
new and rapidly evolving environment.}
}
@article{CormodeGarofalakis08,
author = {Graham Cormode and Minos Garofalakis},
att_authors = {gc2602},
att_private = {false},
title = {Approximate continuous querying over distributed streams},
journal = {{ACM} Transactions on Database Systems},
year = 2008,
month = jun,
link = {http://portal.acm.org/citation.cfm?doid=1366102.1366106},
volume = 33,
number = 2,
abstract = {
While traditional database systems optimize for performance
on one-shot query processing, emerging large-scale monitoring
applications require continuous tracking of complex data-analysis
queries over collections of physically-distributed streams.
Thus, effective solutions have to be simultaneously space/time
efficient (at each remote monitor site), communication efficient
(across the underlying communication network), and provide
continuous, guaranteed-quality approximate query answers.
In this paper, we propose novel algorithmic solutions for the
problem of continuously tracking a broad class of complex
aggregate queries in such a distributed-streams setting.
Our tracking schemes maintain approximate query answers with
provable error guarantees, while simultaneously optimizing the
storage space and processing time at each remote site,
and the communication cost across the network.
In a nutshell, our algorithms rely on tracking general-purpose
randomized sketch summaries of local streams at remote sites
along with concise prediction models of local site behavior
in order to produce highly communication- and space/time-efficient
solutions.
The end result is a powerful approximate query tracking framework
that readily incorporates several complex analysis queries
(including distributed join and multi-join aggregates, and
approximate wavelet representations), thus giving the first
known low-overhead tracking solution for such queries in the
distributed-streams model.
Experiments with real data validate our approach, revealing
significant savings over naive solutions as well as our
analytical worst-case guarantees.}
}
@article{CormodeKornMuthukrishnanSrivastava08,
author = {Graham Cormode and Flip Korn and S. Muthukrishnan and
Divesh Srivastava},
att_authors = {gc2602,ds8961,pk1785},
att_private = {false},
title = {Finding Hierarchical Heavy Hitters in Streaming Data},
journal = {{ACM} Transactions on Knowledge Discovery from Data ({TKDD})},
month = jan,
year = 2008,
volume = 1,
number = 4,
url = {../papers/h4.pdf},
link = {http://doi.acm.org/10.1145/1324172.1324174},
abstract = {
Data items that arrive online as streams
typically have attributes which take values from one or more
hierarchies (time and geographic location; source and
destination IP addresses; etc.). Providing an aggregate view of such
data is important for summarization, visualization, and analysis.
We develop an aggregate view based on certain
organized sets of large-valued regions (``heavy hitters'')
corresponding to hierarchically discounted frequency counts.
We formally define the notion of {\em Hierarchical Heavy Hitters} (HHHs).
We first consider computing (approximate) HHHs over a data stream
drawn from a single hierarchical attribute.
We formalize the problem and give deterministic algorithms
to find them in a single pass over the input.
In order to analyze a wider range of realistic data streams
(e.g., from IP traffic monitoring applications),
we generalize this problem to multiple dimensions.
Here, the semantics of HHHs are more complex, since a ``child'' node
can have multiple ``parent'' nodes.
We present online algorithms that find approximate HHHs in one pass,
with provable accuracy guarantees.
The product of hierarchical dimensions form a
mathematical lattice structure.
Our algorithms exploit this structure, and so are able to
to track approximate
HHHs using only a small, fixed number of statistics per stored item,
regardless of the number of dimensions.
We show experimentally, using real data, that our
proposed algorithms yield outputs which are very similar
(virtually identical, in many cases) to offline computations
of the exact solutions whereas straightforward heavy hitters
based approaches give significantly inferior answer quality.
Furthermore, the proposed algorithms result in an order of
magnitude savings in data structure size while performing
competitively.}
}
@article{CormodeMuthukrishnan05ToN,
journal = {Transactions on Networking},
title = {What's New: Finding Significant Differences
in Network Data Streams},
att_authors = {gc2602},
att_private = {false},
author = {G. Cormode and S. Muthukrishnan},
month = {December},
year = {2005},
volume = {13},
number = {6},
pages = {1219-1232},
url = {../papers/changes-ton.pdf},
link = {CormodeMuthukrishnan04Infocom.html},
pubsite = {http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?isnumber=33142&arnumber=1561219&count=17&index=1},
note = {{\small {\it This paper is © IEEE}}},
abstract = {
Monitoring and analyzing network traffic usage patterns is vital for
managing IP Networks.
An important problem is to provide network managers with information
about changes in traffic, informing them about ``what's new''.
Specifically, we focus on the challenge of finding significantly large
differences in traffic: over time, between interfaces and between
routers.
We introduce the idea of a {\em deltoid}: an item that has a large
difference,
whether the difference is {\em absolute}, {\em relative} or
{\em variational}.
We present novel algorithms for finding the most significant deltoids in high
speed traffic data, and prove that they use small space, very small time per
update, and are guaranteed to find significant deltoids with
pre-specified accuracy.
In experimental evaluation with real network traffic, our algorithms perform well and recover
almost all deltoids.
This is the first work to provide solutions
capable of working over the data with one pass, at network traffic speeds. }
}
@article{CormodeMuthukrishnan05TODS,
author = {G. Cormode and S. Muthukrishnan},
att_authors = {gc2602},
att_private = {false},
title = {What's Hot and What's Not: Tracking Most Frequent Items Dynamically},
journal = {{ACM} Transactions on Database Systems},
note = {{\small {\it This paper is © ACM}}},
year = {2005},
month = {March},
volume = {30},
number = {1},
pages = {249--278},
url = {../papers/whatshot-tods.pdf},
link = {CormodeMuthukrishnan03PODS.html},
pubsite = {http://dl.acm.org/citation.cfm?id=1061325},
abstract = {
Most database management systems maintain statistics
on the underlying relation. One of the important statistics
is that of the 'hot items' in the relation: those
that appear many times (most frequently, or more than some threshold).
For example, end-biased histograms keep the hot items as part
of the histogram
and are used in selectivity estimation.
Hot items are used as simple outliers in data mining, and
in anomaly detection in many applications.
We present new methods for dynamically
determining the hot items at any time in a relation
which is undergoing deletion operations as well as inserts.
Our methods maintain small space data structures that
monitor the transactions on the relation, and when required, quickly output
all hot items, without rescanning the relation in the database.
With user-specified probability, all hot items are correctly reported.
Our methods rely on ideas from 'group testing'.
They are simple to implement, and have provable
quality, space and time guarantees.
Previously known algorithms for this problem that make similar quality and
performance guarantees can not handle deletions, and those that handle
deletions can not make similar guarantees without rescanning the database. Our
experiments with real and synthetic data shows that our
algorithms are accurate in dynamically tracking the hot items
independent of the rate of insertions and deletions. }
}
@article{Cormode04AIR,
author = {G. Cormode},
att_authors = {gc2602},
att_private = {false},
title = {Representations of the Research Student in Popular Culture},
year = {2004},
journal = {Annals of Improbable Research},
volume = {10},
number = {1},
pages = {26-27},
url = {http://dimacs.rutgers.edu/~graham/silly/grad.html},
link = {http://www.improbable.com/airchives/paperair/volume10/v10i1/v10i1-toc.html},
abstract = {
Attracting new entrants to the world of research relies in part on the existence of role models in the popular media as observed by the current undergraduates. Hence, we examine the world of TV and films that get shown on cable all the time. There is no shortage of visible examples at the professorial level, from Archaeology (Indiana Jones) to Paleontology (Ross from Friends). However, it is a matter of concern that the first stage of an academic career, the research student, is less frequently portrayed. We will consider a number of examples of media portayals of young researchers based on a uniform sample of those I could think of. We analyze whether their depiction will encourage young people to follow their lead into PhD programmes, and also to successfully complete their studies.}
}
@article{CormodeMuthukrishnan04CMJalg,
author = {G. Cormode and S. Muthukrishnan},
att_authors = {gc2602},
att_private = {false},
title = {An Improved Data Stream Summary: The Count-Min Sketch
and its Applications},
year = {2005},
journal = {Journal of Algorithms},
note = {{\small {\it This paper is © Elsevier}}},
month = {April},
volume = {55},
number = {1},
pages = {58--75},
abstract = {We introduce a new sublinear space data structure---the Count-Min Sketch--- for summarizing data streams. Our sketch allows fundamental queries in data stream summarization such as point, range, and inner product queries to be approximately answered very quickly; in addition, it can be applied to solve several important problems in data streams such as finding quantiles, frequent items, etc. The time and space bounds we show for using the CM sketch to solve these problems significantly improve those previously known --- typically from $1/epsilon^2$ to $1/\epsilon$ in factor.},
url = {../papers/cm-full.pdf},
link = {CormodeMuthukrishnan04CMLatin.html},
pubsite = {http://dx.doi.org/10.1016/j.jalgor.2003.12.001}
}
@article{CormodeDatarIndykMuthukrishnan03,
author = {G. Cormode and M. Datar and P. Indyk and
S. Muthukrishnan},
att_authors = {gc2602},
att_private = {false},
title = {Comparing Data Streams Using {H}amming Norms},
year = {2003},
pages = {529--541},
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
volume = {15},
number = {3},
note = {{\small {\it This paper is © IEEE}}},
url = {../papers/cdim-hammingnormtkde.pdf},
link = {CormodeDatarIndykMuthukrishnan02.html},
pubsite = {http://www.computer.org/tkde/tk2003/k0529abs.htm},
abstract = {
Massive data streams are now fundamental to many data processing
applications. For example, Internet routers produce large scale
diagnostic data streams. Such streams are rarely stored in
traditional databases, and instead must be processed ``on the fly'' as
they are produced. Similarly, sensor networks produce multiple data
streams of observations from their sensors. There is growing focus on
manipulating data streams, and hence, there is a need to identify
basic operations of interest in managing data streams, and to support
them efficiently.
We propose computation of the Hamming norm as a basic operation of
interest. The Hamming norm formalizes ideas
that are used throughout data processing.
When applied to a single stream, the Hamming norm gives the number of distinct
items that are
present in that data stream, which is a statistic of great interest in
databases. When applied to a pair of streams, the Hamming norm gives
an important measure of (dis)similarity: the number of unequal item
counts in the two streams. Hamming norms have many uses
in comparing data streams.
We present a novel approximation technique for estimating the Hamming norm
for massive data streams; this relies on what we call the ``$l_0$
{\it sketch}''
and we prove its accuracy. We test our approximation method on a large
quantity of synthetic and real stream data, and show that the
estimation is accurate to within a few percentage points. }
}
@article{CormodeMuthukrishnan07,
author = {G. Cormode and S. Muthukrishnan},
att_authors = {gc2602},
att_private = {false},
title = {The String Edit Distance Matching Problem with Moves},
year = {2007},
journal = {{ACM} Transactions on Algorithms},
volume = 3,
number = 1,
url = {../papers/editmovestalg.pdf},
link = {CormodeMuthukrishnan02.html},
pubsite = {http://dx.doi.org/10.1145/1186810.1186812},
abstract = {
The edit distance between two strings $S$ and $R$
is defined to be the minimum number of character inserts,
deletes and changes needed to convert $R$ to $S$.
Given a text string $t$ of length $n$, and a pattern string $p$
of length $m$, informally, the string edit distance matching problem is to
compute the smallest edit distance between $p$ and
substrings of $t$.
We relax the problem
so that (a) we allow an additional operation, namely,
{\em substring moves}, and (b) we
allow approximation of this string edit distance.
Our result is a near linear time deterministic
algorithm to produce a factor of $O(\log n \log^* n)$
approximation to the string edit distance with moves.
This is the first
known significantly subquadratic algorithm for a string
edit distance problem in which the distance involves
nontrivial alignments.
Our results are obtained by embedding strings into $L_1$
vector space using a simplified parsing technique we call
{\em Edit Sensitive Parsing} (ESP). }
}
@article{OzsoyogluCormodeBalkirOzsoyoglu04,
author = {G. Ozsoyoglu and N. H. Balkir and
G. Cormode and Z. M. Ozsoyoglu},
att_authors = {gc2602},
att_private = {false},
title = {Electronic Books in Digital Libraries},
year = {2004},
note = {{\small {\it This paper is © IEEE}}},
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
volume = {16},
number = {3},
pages = {317--331},
url = {../papers/obcotkde.pdf},
link = {OzsoyogluCormodeBalkirOzsoyoglu00.html},
pubsite = {http://csdl.computer.org/comp/trans/tk/2004/03/k0317abs.htm},
abstract = {
Electronic Book is an application with a multimedia database of instructional resources, which include hyperlinked text, instructor's audio/video clips, slides, animation, still images, etc. as well as content-based infomration about these data, and metadata such as annotations, tags, and cross-referencing information. Electronic books in the Internet or on CDs today are not easy to learn from. We propose the use of a multimedia database of instructional resources in constructing and delivering multimedia lessons about topics in an electronic book.
We introduce an electronic book data model containing (a) topic objects and and (b) instructional resources, called instruction modules which are multimedia presentations possibly capturing real-life lectures of instructors. We use the notion of topic prerequisites for topics at different detail levels, to allow electronic book users to request / compose multimedia lessons about topics on the electronic book. We present automated construction of the ''best'' user-tailored lesson (as a multimedia presentation). }
}
This file was generated by bibtex2html 1.92.