Google OR-Tools v9.15
a fast and portable software suite for combinatorial optimization
Loading...
Searching...
No Matches
space_saving_most_frequent.h
Go to the documentation of this file.
1// Copyright 2010-2025 Google LLC
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14#ifndef ORTOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
15#define ORTOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
16
17#include <cstddef>
18#include <cstdint>
19#include <functional>
20#include <utility>
21#include <vector>
22
23#include "absl/base/attributes.h"
24#include "absl/base/nullability.h"
25#include "absl/container/flat_hash_set.h"
26#include "absl/hash/hash.h"
27#include "absl/log/check.h"
28
29namespace operations_research {
30
31namespace ssmf_internal {
32
33template <typename T>
35
36template <typename T>
38
39} // namespace ssmf_internal
40
41// Space-Saving is an approximate algorithm for finding the most frequent items
42// in a data stream. It is conceptually very simple: we maintain a list of at
43// most `storage_size` elements and the number of times each of them has been
44// seen. When a new element is added and the list is full, we remove the least
45// frequent item (the one with the lowest count). If there is a tie, we remove
46// the oldest one. See space_saving_most_frequent_test.cc for a trivial
47// implementation that yield identical results to this class but is much slower.
48//
49// The implementation is based on [1], which describes a way of storing the
50// items so all the operations are O(1). The elements that have the same count
51// (a "bucket") are stored in a doubly-linked list, ordered by the time of
52// insertion. The buckets are also stored in a doubly-linked list, ordered by
53// number of counts. Thus, to increment the count of an element we need to
54// remove it from its bucket and add it to the next one, which is a removal and
55// an inclusion in linked lists and thus takes O(1) time.
56//
57// [1] Graham Cormode, Marios Hadjieleftheriou. Methods for finding frequent
58// items in data streams. The VLDB Journal (2010) 19: 3.
59// http://dimacs.rutgers.edu/~graham/pubs/papers/freqvldbj.pdf
60//
61// This class is thread-compatible.
62template <typename T, typename Hash = absl::Hash<T>,
63 typename Eq = std::equal_to<T>>
65 public:
66 // Create a data structure holding at most `storage_size` elements in memory.
67 // That means that frequent elements that are added less frequently than
68 // `1/storage_size` will be ignored.
69 explicit SpaceSavingMostFrequent(int storage_size);
70
72
73 // Adds `value` to the data structure.
74 // Complexity: O(1).
75 void Add(T value);
76
77 // Removes all occurrences of `value` from the data structure. Does nothing if
78 // the element is not in the data structure.
79 // Complexity: O(1).
80 void FullyRemove(const T& value);
81
82 // Returns the `num_samples` most frequent elements in the data structure
83 // sorted by decreasing count. Note: this does not work with non-copyable
84 // types.
85 // TODO(user): Replace this by an iterator with a begin() and end().
86 std::vector<std::pair<T, int64_t>> GetMostFrequent(int num_samples) const;
87
88 // Equivalent to calling GetMostFrequent(1) and popping the first element.
90
91 // Equivalent of GetMostFrequent(1).second. Returns zero if the data structure
92 // is empty.
93 int64_t CountOfMostFrequent() const;
94
95 private:
96 struct Bucket;
97
98 // The nodes of the doubly-linked list of elements for a given bucket (ie.,
99 // sharing the same count).
100 struct Item {
101 T value;
102 Bucket* absl_nonnull bucket;
103 Item* absl_nullable next = nullptr;
104 Item* absl_nullable prev = nullptr;
105 };
107
108 // A bucket of elements with the same count. They are stored in a
109 // doubly-linked list ordered by the time of insertion.
110 struct Bucket {
111 int64_t count; // The count of this bucket.
112 ItemList items; // front (oldest), back (newest).
113 Bucket* absl_nullable next = nullptr; // Bucket with lower count.
114 Bucket* absl_nullable prev = nullptr; // Bucket with higher count.
115 };
117
118 void RemoveIfEmpty(Bucket* absl_nonnull bucket) {
119 if (bucket->items.empty()) {
120 bucket_alloc_.Return(buckets_.erase(bucket));
121 }
122 }
123
124 void RemoveFromLinkedList(Item* absl_nonnull item) {
125 Bucket* absl_nonnull bucket = item->bucket;
126 item_alloc_.Return(bucket->items.erase(item));
127 RemoveIfEmpty(bucket);
128 }
129
130 Bucket* absl_nonnull GetBucketForCountOne() {
131 if (!buckets_.empty() && buckets_.back()->count == 1) {
132 return buckets_.back();
133 }
134 // We need to create a new empty bucket, which will be the last one.
135 Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
136 bucket->count = 1;
137 return bucket;
138 }
139
140 const int storage_size_;
143 BucketList buckets_; // front with highest count.
144
145 struct HashItemPtr {
146 using is_transparent = void;
147 size_t operator()(const Item* absl_nonnull value) const {
148 return Hash()(value->value);
149 }
150 size_t operator()(const T& value) const { return Hash()(value); }
151 };
152
153 struct EqItemPtr {
154 using is_transparent = void;
155 bool operator()(const Item* absl_nonnull a,
156 const Item* absl_nonnull b) const {
157 return Eq()(a->value, b->value);
158 }
159 bool operator()(const Item* absl_nonnull a, const T& b) const {
160 return Eq()(a->value, b);
161 }
162 bool operator()(const T& a, const Item* absl_nonnull b) const {
163 return Eq()(a, b->value);
164 }
165 };
166 absl::flat_hash_set<Item* absl_nonnull, HashItemPtr, EqItemPtr> item_ptr_set_;
167};
168
169template <typename T, typename Hash, typename Eq>
171 : storage_size_(storage_size),
172 item_alloc_(storage_size),
173 bucket_alloc_(storage_size + 1) {
174 CHECK_GT(storage_size, 0);
175 item_ptr_set_.reserve(2 * storage_size);
176}
177
178// Properly return all buckets and items to their allocators to ensure proper
179// destruction.
180template <typename T, typename Hash, typename Eq>
182#ifdef NDEBUG
183 bucket_alloc_.DisposeAll();
184 item_alloc_.DisposeAll();
185#else
186 while (!buckets_.empty()) {
187 auto& items = buckets_.front()->items;
188 while (!items.empty()) {
189 item_alloc_.Return(items.pop_front());
190 }
191 bucket_alloc_.Return(buckets_.pop_front());
192 }
193#endif
194}
195
196template <typename T, typename Hash, typename Eq>
198 if (buckets_.empty()) {
199 // We are adding an element to an empty data structure.
200 DCHECK(item_alloc_.empty());
201 DCHECK(item_ptr_set_.empty());
202 Bucket* absl_nonnull bucket = buckets_.insert_back(bucket_alloc_.New());
203 Item* absl_nonnull const item =
204 bucket->items.insert_front(item_alloc_.New());
205 item->bucket = bucket;
206 item->value = std::move(value);
207 bucket->count = 1;
208 item_ptr_set_.emplace(item);
209 return;
210 }
211
212 DCHECK(!buckets_.empty());
213
214 auto it = item_ptr_set_.find(value);
215 if (it == item_ptr_set_.end()) {
216 // We are adding a new element. First, check if we are full, and if so,
217 // remove the least frequent element.
218 if (item_alloc_.full()) {
219 // Remove an entry from the last bucket where the `count` is lowest.
220 Bucket* absl_nonnull const last_bucket = buckets_.back();
221 // We want to remove the oldest one, with the idea that it is potentially
222 // the real least frequent of the bucket since it was unseen for longer.
223 Item* absl_nonnull recycled_item = last_bucket->items.front();
224 // Reclaim its storage for the newly added element.
225 item_ptr_set_.erase(recycled_item);
226 item_alloc_.Return(last_bucket->items.pop_front());
227 RemoveIfEmpty(last_bucket);
228 }
229 Bucket* absl_nonnull bucket = GetBucketForCountOne();
230 DCHECK_EQ(bucket->count, 1);
231 Item* absl_nonnull item = bucket->items.insert_back(item_alloc_.New());
232 item->value = std::move(value);
233 item->bucket = bucket;
234 item_ptr_set_.emplace_hint(it, item);
235 } else {
236 Item* absl_nonnull item = *it;
237 Bucket* absl_nonnull bucket = item->bucket;
238 ItemList& current_bucket_items = bucket->items;
239 const int64_t new_count = bucket->count + 1;
240 const bool no_bucket_for_new_count =
241 (bucket->prev == nullptr) || (bucket->prev->count > new_count);
242 if (no_bucket_for_new_count && current_bucket_items.single()) {
243 // Small optimization for very common elements: if the element is alone in
244 // a bucket and there is no bucket for count + 1, we can just increment
245 // the count of the bucket.
246 bucket->count = new_count;
247 return;
248 }
249 // Extract item from this bucket.
250 auto dangling_item = current_bucket_items.erase(item);
251 // Fetch the bucket with the correct count.
252 Bucket* new_bucket = nullptr;
253 if (bucket->prev && bucket->prev->count == new_count) {
254 new_bucket = bucket->prev;
255 } else {
256 // We create a new empty bucket, which will be before the current bucket.
257 new_bucket = buckets_.insert_before(bucket, bucket_alloc_.New());
258 new_bucket->count = new_count;
259 }
260 // Insert the item in the new bucket at the end (newest).
261 dangling_item->bucket = new_bucket;
262 new_bucket->items.insert_back(std::move(dangling_item));
263
264 // Reclaim old bucket if it is empty.
265 RemoveIfEmpty(bucket);
266 }
267}
268
269template <typename T, typename Hash, typename Eq>
271 auto node = item_ptr_set_.extract(value);
272 if (node.empty()) return;
273 RemoveFromLinkedList(node.value());
274}
275
276template <typename T, typename Hash, typename Eq>
277std::vector<std::pair<T, int64_t>>
279 std::vector<std::pair<T, int64_t>> result;
280 result.reserve(num_samples);
281 if (!buckets_.empty()) {
282 for (Bucket* bucket = buckets_.front(); bucket; bucket = bucket->next) {
283 const int64_t count = bucket->count;
284 DCHECK(!bucket->items.empty());
285 for (Item* item = bucket->items.back(); item; item = item->prev) {
286 if (result.size() == num_samples) return result;
287 result.emplace_back(item->value, count);
288 }
289 }
290 }
291 return result;
292}
293
294template <typename T, typename Hash, typename Eq>
296 CHECK(!buckets_.empty());
297 Item* absl_nonnull item = buckets_.front()->items.back();
298 DCHECK(item_ptr_set_.contains(item));
299 item_ptr_set_.erase(item);
300 T value = std::move(item->value);
301 RemoveFromLinkedList(item);
302 return value;
303}
304
305template <typename T, typename Hash, typename Eq>
307 return buckets_.empty() ? 0 : buckets_.front()->count;
308}
309
310namespace ssmf_internal {
311
312// This is semantically equivalent to a `std::unique_ptr` except that it does
313// not own the object and that only `BoundedAllocator` and `DoubleLinkedList`
314// are able to manage the stored pointer.
315// Other clients can access the object with the guarantee that it is non null.
316template <typename T>
317class Ptr {
318 explicit Ptr(T* absl_nullable ptr) : ptr_(ptr) { get_nonnull(); }
319
320 T* absl_nonnull get_nonnull() const {
321 DCHECK(ptr_ != nullptr);
322 return ptr_;
323 };
324
325 T* absl_nonnull release() {
326 T* absl_nonnull ptr = get_nonnull();
327 ptr_ = nullptr;
328 return ptr;
329 }
330
331 T* absl_nullable ptr_ = nullptr;
332
333 public:
334 Ptr(const Ptr&) = delete;
335 Ptr(Ptr&& other) : ptr_(other.release()) {}
336 ~Ptr() { DCHECK(ptr_ == nullptr); }
337 T* absl_nonnull operator->() const { return get_nonnull(); };
338 T& operator*() const { return *get_nonnull(); };
339 friend class BoundedAllocator<T>;
340 friend class DoubleLinkedList<T>;
341};
342
343// Allocator that allows creating up to `max_size` objects.
344// Storage is allocated at once contiguously which helps with cache locality.
345// Objects that are returned to the allocator are stored in a freelist for later
346// use and are not destroyed right away. Objects that are extracted from the
347// freelist are default initialized for correctness.
348// The allocator makes sure that all created objects are returned to the pool
349// upon destruction, this allows to catch logic errors. It is possible to bypass
350// this behavior when it is safe to destroy all object at once by calling the
351// `DisposeAll` method. Once this method is called the allocator cannot be used
352// anymore.
353template <typename T>
355 public:
356 explicit BoundedAllocator(size_t max_size) : data_(max_size) {
357 freelist_.reserve(max_size);
358 for (auto& data : data_) {
359 freelist_.push_back(&data);
360 }
361 }
362
364 CHECK(empty()) << "some elements are not returned and won't be destroyed.";
365 }
366
367 bool full() const { return freelist_.empty(); }
368
369 bool empty() const { return data_.size() == freelist_.size(); }
370
372 CHECK(!freelist_.empty());
373 T* absl_nonnull t = freelist_.back();
374 freelist_.pop_back();
375 return Ptr<T>(t);
376 }
377
378 void Return(Ptr<T> ptr) {
379 T* absl_nonnull t = ptr.release();
380 DCHECK(t != nullptr);
381 DCHECK_GE(t, &data_.front());
382 DCHECK_LE(t, &data_.back());
383 *t = T();
384 freelist_.push_back(t);
385 }
386
387 // Destroys all allocated objects.
388 // Once called, it is no more possible to allocate new objects.
389 void DisposeAll() {
390 freelist_.clear();
391 data_.clear();
392 }
393
394 private:
395 std::vector<T> data_;
396 std::vector<T* absl_nonnull> freelist_;
397};
398
399// A simple doubly linked list with ownership transfer.
400// All elements added or extracted from the list are done though the `Ptr`
401// abstraction. This guarantees that there is always only one owner.
402template <typename T>
404 public:
405 DoubleLinkedList() = default;
408 DCHECK_EQ(front_, nullptr);
409 DCHECK_EQ(front_, nullptr);
410 }
411
412 bool empty() const {
413 DCHECK_EQ(front_ == nullptr, back_ == nullptr);
414 return front_ == nullptr;
415 }
416
417 bool single() const {
418 DCHECK_EQ(front_ == nullptr, back_ == nullptr);
419 return front_ != nullptr && front_ == back_;
420 }
421
422 T* absl_nonnull front() const {
423 DCHECK_NE(front_, nullptr);
424 return front_;
425 }
426
427 T* absl_nonnull back() const {
428 DCHECK_NE(back_, nullptr);
429 return back_;
430 }
431
432 T* absl_nonnull insert_after(T* absl_nonnull node, Ptr<T> new_node) {
433 T* absl_nonnull new_node_ptr = new_node.release();
434 new_node_ptr->prev = node;
435 if (node->next == nullptr) {
436 DCHECK_EQ(new_node_ptr->next, nullptr);
437 back_ = new_node_ptr;
438 } else {
439 new_node_ptr->next = node->next;
440 node->next->prev = new_node_ptr;
441 }
442 node->next = new_node_ptr;
443 return new_node_ptr;
444 }
445
446 T* absl_nonnull insert_before(T* absl_nonnull node, Ptr<T> new_node) {
447 T* absl_nonnull new_node_ptr = new_node.release();
448 new_node_ptr->next = node;
449 if (node->prev == nullptr) {
450 DCHECK_EQ(new_node_ptr->prev, nullptr);
451 front_ = new_node_ptr;
452 } else {
453 new_node_ptr->prev = node->prev;
454 node->prev->next = new_node_ptr;
455 }
456 node->prev = new_node_ptr;
457 return new_node_ptr;
458 }
459
460 T* absl_nonnull insert_front(Ptr<T> new_node) {
461 if (front_ != nullptr) {
462 return insert_before(front_, std::move(new_node));
463 }
464 T* absl_nonnull new_node_ptr = new_node.release();
465 front_ = new_node_ptr;
466 back_ = new_node_ptr;
467 new_node_ptr->prev = nullptr;
468 new_node_ptr->next = nullptr;
469 return new_node_ptr;
470 }
471
472 T* absl_nonnull insert_back(Ptr<T> new_node) {
473 if (back_ != nullptr) {
474 return insert_after(back_, std::move(new_node));
475 } else {
476 return insert_front(std::move(new_node));
477 }
478 }
479
480 ABSL_MUST_USE_RESULT Ptr<T> erase(T* absl_nonnull node) {
481 if (node->prev) {
482 node->prev->next = node->next;
483 } else {
484 front_ = node->next;
485 }
486 if (node->next) {
487 node->next->prev = node->prev;
488 } else {
489 back_ = node->prev;
490 }
491 node->next = nullptr;
492 node->prev = nullptr;
493 return Ptr<T>(node);
494 }
495
496 ABSL_MUST_USE_RESULT Ptr<T> pop_front() { return erase(front()); }
497
498 ABSL_MUST_USE_RESULT Ptr<T> pop_back() { return erase(back()); }
499
500 private:
501 T* absl_nullable front_ = nullptr;
502 T* absl_nullable back_ = nullptr;
503};
504} // namespace ssmf_internal
505
506} // namespace operations_research
507
508#endif // ORTOOLS_ALGORITHMS_SPACE_SAVING_MOST_FREQUENT_H_
std::vector< std::pair< T, int64_t > > GetMostFrequent(int num_samples) const
ABSL_MUST_USE_RESULT Ptr< T > erase(T *absl_nonnull node)
T *absl_nonnull insert_after(T *absl_nonnull node, Ptr< T > new_node)
DoubleLinkedList(const DoubleLinkedList &)=delete
T *absl_nonnull insert_before(T *absl_nonnull node, Ptr< T > new_node)
OR-Tools root namespace.