Google OR-Tools v9.11
a fast and portable software suite for combinatorial optimization
Loading...
Searching...
No Matches
duplicate_remover.h
Go to the documentation of this file.
1// Copyright 2010-2024 Google LLC
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5//
6// http://www.apache.org/licenses/LICENSE-2.0
7//
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14#ifndef OR_TOOLS_ALGORITHMS_DUPLICATE_REMOVER_H_
15#define OR_TOOLS_ALGORITHMS_DUPLICATE_REMOVER_H_
16
17#include <cstddef>
18#include <cstdint>
19#include <vector>
20
21#include "absl/log/check.h"
22#include "absl/numeric/bits.h"
23#include "absl/random/distributions.h"
24#include "absl/random/random.h"
25#include "absl/types/span.h"
26#include "google/protobuf/repeated_field.h"
27
28namespace operations_research {
29
30// This class offers an alternative to gtl::linked_hash_set<> which is:
31// - stateless: it works directly on a vector<int> or any similar container,
32// without storing extra data anywhere;
33// - faster when the number of unique values is 5K or above.
34//
35// The memory usage can be O(num_distinct_values) at any time if you use
36// AppendAndLazilyRemoveDuplicates(). In fact, unit tests verify that the
37// average number of elements kept is ≤ 1.5 * num_distinct_values, making
38// it comparable to a flat_hash_set<int> (whose overhead factor is ~1.68).
39//
40// Usage pattern:
41//
42// // One instance of this can handle many sets on the same [0, n) domain.
43// int N = 100'000;
44// DenseIntDuplicateRemover deduper(N); // Uses N/8 bytes of memory.
45// std::vector<int> values; // Your container. Could be RepeatedField<int>.
46// for (int x : ...) {
47// deduper.AppendAndLazilyRemoveDuplicates(x, &values); // O(1) amortized.
48// }
49// deduper.RemoveDuplicates(&values); // O(values.size())
50//
52 public:
54 : n_(n),
55 tmp_mask_storage_((n + 7) / 8, 0),
56 tmp_mask_(tmp_mask_storage_) {}
57
58 template <class IntContainer>
59 void RemoveDuplicates(IntContainer* container);
60
61 template <class IntContainer>
62 void AppendAndLazilyRemoveDuplicates(int x, IntContainer* container);
63
64 private:
65 template <class IntContainer>
66 void Append(int x, IntContainer* container);
67
68 template <class IntContainer>
69 void Truncate(size_t new_size, IntContainer* container);
70
71 size_t RemoveDuplicatesInternal(absl::Span<int> span);
72
73 absl::BitGen random_;
74 const int n_;
75 std::vector<uint8_t> tmp_mask_storage_;
76 const absl::Span<uint8_t> tmp_mask_;
77};
78
79// _____________________________________________________________________________
80// Implementation of the templates.
81
82template <class IntContainer>
83void DenseIntDuplicateRemover::RemoveDuplicates(IntContainer* container) {
84 const size_t new_size = RemoveDuplicatesInternal(absl::MakeSpan(*container));
85 Truncate(new_size, container);
86}
87
88template <class IntContainer>
90 int x, IntContainer* container) {
91 DCHECK_GE(x, 0);
92 DCHECK_LT(x, n_);
93 Append(x, container);
94 // ALGORITHM:
95 // In order to remain stateless, yet call RemoveDuplicates() often enough
96 // that the size of the container remains O(num_distinct_elements), but not
97 // too often since we must remain O(1) time amortized, we randomize:
98 // every time we append an element, we'll call RemoveDuplicates() with
99 // probability 1/k, where k is the current size of the container.
100 // That way, the added expected complexity is O(k)*1/k = O(1), yet we know
101 // that we'll eventually call it. See the unit tests that verify the claims.
102 // As an important optimization, since drawing the pseudo-random number is
103 // expensive, we only perform it every kCheckPeriod, and to compensate we
104 // multiply the probability by the same amount.
105 constexpr int kCheckPeriod = 8;
106 static_assert(absl::popcount(unsigned(kCheckPeriod)) == 1,
107 "must be power of two");
108 const size_t size = container->size();
109 if (size & (kCheckPeriod - 1)) return;
110 if (size >= 2 * n_ ||
111 absl::Uniform<size_t>(random_, 0, container->size()) < kCheckPeriod) {
112 RemoveDuplicates(container);
113 }
114}
115
116template <>
117inline void DenseIntDuplicateRemover::Append(int x,
118 std::vector<int>* container) {
119 container->push_back(x);
120}
121
122template <>
123inline void DenseIntDuplicateRemover::Append(
124 int x, google::protobuf::RepeatedField<int>* container) {
125 container->Add(x);
126}
127
128template <>
129inline void DenseIntDuplicateRemover::Truncate(size_t new_size,
130 std::vector<int>* container) {
131 container->resize(new_size);
132}
133
134template <>
135inline void DenseIntDuplicateRemover::Truncate(
136 size_t new_size, google::protobuf::RepeatedField<int>* container) {
137 container->Truncate(new_size);
138}
139
140} // namespace operations_research
141
142#endif // OR_TOOLS_ALGORITHMS_DUPLICATE_REMOVER_H_
IntegerValue size
void RemoveDuplicates(IntContainer *container)
void AppendAndLazilyRemoveDuplicates(int x, IntContainer *container)
In SWIG mode, we don't want anything besides these top-level includes.
const Variable x
Definition qp_tests.cc:127