Google OR-Tools v9.14
A fast and portable software suite for combinatorial optimization
set_cover_model.cc
// Copyright 2010-2025 Google LLC
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "ortools/set_cover/set_cover_model.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "absl/log/check.h"
#include "absl/log/log.h"
#include "absl/numeric/bits.h"
#include "absl/random/discrete_distribution.h"
#include "absl/random/distributions.h"
#include "absl/random/random.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "ortools/algorithms/radix_sort.h"
#include "ortools/set_cover/base_types.h"
#include "ortools/set_cover/set_cover.pb.h"

namespace operations_research {

namespace {

// Returns min + floor(scaling_factor * (raw_value - min + random_term)),
// where raw_value is drawn from the discrete distribution dist and
// random_term is a double drawn uniformly in [0, 1).
BaseInt DiscreteAffine(absl::BitGen& bitgen,
                       absl::discrete_distribution<BaseInt>& dist, BaseInt min,
                       double scaling_factor) {
  const BaseInt raw_value = dist(bitgen);
  const double random_term = absl::Uniform<double>(bitgen, 0, 1.0);
  const BaseInt affine_value =
      static_cast<BaseInt>(
          std::floor((raw_value - min + random_term) * scaling_factor)) +
      min;
  return affine_value;
}
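
// A quick worked illustration of the mapping above (values made up): with
// min = 2, scaling_factor = 1.5, raw_value = 3 and random_term = 0.4, the
// result is floor((3 - 2 + 0.4) * 1.5) + 2 = floor(2.1) + 2 = 4. The uniform
// random_term fills the gaps that a scaling factor greater than 1 would
// otherwise leave between reachable values.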

// For a given view (SparseColumnView or SparseRowView), returns the
// distribution of the sizes of the vectors in the view, which can be used in
// an absl::discrete_distribution.
template <typename View>
std::tuple<BaseInt, std::vector<BaseInt>> ComputeSizeHistogram(
    const View& view) {
  BaseInt max_size = 0;
  BaseInt min_size = std::numeric_limits<BaseInt>::max();
  for (const auto& vec : view) {
    const BaseInt size = vec.size();
    min_size = std::min(min_size, size);
    max_size = std::max(max_size, size);
  }
  std::vector<BaseInt> weights(max_size + 1, 0);
  for (const auto& vec : view) {
    const BaseInt size = vec.size();
    ++weights[size];
  }
  return {min_size, weights};
}

template <typename View>
std::tuple<BaseInt, absl::discrete_distribution<BaseInt>>
ComputeSizeDistribution(const View& view) {
  const auto [min_size, weights] = ComputeSizeHistogram(view);
  absl::discrete_distribution<BaseInt> dist(weights.begin(), weights.end());
  return {min_size, dist};
}
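
// A minimal sketch of how the two helpers combine (sizes made up): a view
// whose vectors have sizes {2, 4, 4} gives min_size = 2 and weights
// {0, 0, 1, 0, 2}, so the resulting distribution draws 2 with probability
// 1/3 and 4 with probability 2/3.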
}  // namespace

SetCoverModel SetCoverModel::GenerateRandomModelFrom(
    const SetCoverModel& seed_model, BaseInt num_elements, BaseInt num_subsets,
    double row_scale, double column_scale, double cost_scale) {
  SetCoverModel model;
  StopWatch stop_watch(&(model.generation_duration_));
  DCHECK_GT(row_scale, 0.0);
  DCHECK_GT(column_scale, 0.0);
  DCHECK_GT(cost_scale, 0.0);
  model.num_elements_ = num_elements;
  model.num_nonzeros_ = 0;
  model.ResizeNumSubsets(num_subsets);
  model.UpdateAllSubsetsList();
  absl::BitGen bitgen;

  // Create the distribution of the cardinalities of the subsets based on the
  // histogram of column sizes in the seed model.
  auto [min_column_size, column_dist] =
      ComputeSizeDistribution(seed_model.columns());

  // Create the distribution of the degrees of the elements based on the
  // histogram of row sizes in the seed model.
  auto [min_row_size, row_dist] = ComputeSizeDistribution(seed_model.rows());

  // Prepare the degrees of the elements in the generated model, and use them
  // in a distribution to generate the columns. This biases the columns
  // towards elements with higher degrees.
  ElementToIntVector degrees(num_elements);
  for (ElementIndex element(0); element.value() < num_elements; ++element) {
    degrees[element] =
        DiscreteAffine(bitgen, row_dist, min_row_size, row_scale);
  }
  absl::discrete_distribution<BaseInt> degree_dist(degrees.begin(),
                                                   degrees.end());

  // Vector indicating whether the generated model covers an element.
  ElementBoolVector contains_element(num_elements, false);

  // Number of elements in the generated model, using the above vector.
  BaseInt num_elements_covered(0);

  // Maximum number of tries to generate a random element that is not yet in
  // the subset, or a random subset that does not contain the element.
  const int kMaxTries = 10;

  // Loop-local vector indicating whether the currently generated subset
  // contains an element.
  ElementBoolVector subset_already_contains_element(num_elements, false);
  for (SubsetIndex subset(0); subset.value() < num_subsets; ++subset) {
    LOG_EVERY_N_SEC(INFO, 5)
        << absl::StrFormat("Generating subset %d (%.1f%%)", subset.value(),
                           100.0 * subset.value() / num_subsets);
    const BaseInt cardinality =
        DiscreteAffine(bitgen, column_dist, min_column_size, column_scale);
    model.columns_[subset].reserve(cardinality);
    for (BaseInt iter = 0; iter < cardinality; ++iter) {
      int num_tries = 0;
      ElementIndex element;
      // Choose an element that is not yet in the subset at random with a
      // distribution that is proportional to the degree of the element.
      do {
        element = ElementIndex(degree_dist(bitgen));
        CHECK_LT(element.value(), num_elements);
        ++num_tries;
      } while (num_tries < kMaxTries &&
               subset_already_contains_element[element]);
      ++model.num_nonzeros_;
      model.columns_[subset].push_back(element);
      subset_already_contains_element[element] = true;
      if (!contains_element[element]) {
        contains_element[element] = true;
        ++num_elements_covered;
      }
    }
    for (const ElementIndex element : model.columns_[subset]) {
      subset_already_contains_element[element] = false;
    }
  }
  VLOG(1) << "Finished generating the model with " << num_elements_covered
          << " elements covered.";

  // It can happen -- rarely in practice -- that some of the elements cannot be
  // covered. Let's add them to randomly chosen subsets.
  if (num_elements_covered != num_elements) {
    VLOG(1) << "Generated model with " << num_elements - num_elements_covered
            << " elements that cannot be covered. Adding them to random "
               "subsets.";
    SubsetBoolVector element_already_in_subset(num_subsets, false);
    for (ElementIndex element(0); element.value() < num_elements; ++element) {
      VLOG_EVERY_N_SEC(1, 5) << absl::StrFormat(
          "Generating subsets for element %d (%.1f%%)", element.value(),
          100.0 * element.value() / num_elements);
      if (!contains_element[element]) {
        const BaseInt degree =
            DiscreteAffine(bitgen, row_dist, min_row_size, row_scale);
        std::vector<SubsetIndex> generated_subsets;
        generated_subsets.reserve(degree);
        for (BaseInt i = 0; i < degree; ++i) {
          int num_tries = 0;
          SubsetIndex subset_index;
          // The method is the same as above.
          do {
            subset_index = SubsetIndex(DiscreteAffine(
                bitgen, column_dist, min_column_size, column_scale));
            ++num_tries;
          } while (num_tries < kMaxTries &&
                   element_already_in_subset[subset_index]);
          ++model.num_nonzeros_;
          model.columns_[subset_index].push_back(element);
          element_already_in_subset[subset_index] = true;
          generated_subsets.push_back(subset_index);
        }
        for (const SubsetIndex subset_index : generated_subsets) {
          element_already_in_subset[subset_index] = false;
        }
        contains_element[element] = true;
        ++num_elements_covered;
      }
    }
    VLOG(1) << "Finished generating subsets for elements that were not "
               "covered in the original model.";
  }
  VLOG(1) << "Finished generating the model. There are "
          << num_elements - num_elements_covered << " uncovered elements.";

  CHECK_EQ(num_elements_covered, num_elements);

  // TODO(user): if necessary, use a better distribution for the costs.
  // The generation of the costs is done in two steps. First, compute the
  // minimum and maximum costs.
  Cost min_cost = std::numeric_limits<Cost>::infinity();
  Cost max_cost = -min_cost;
  for (const Cost cost : seed_model.subset_costs()) {
    min_cost = std::min(min_cost, cost);
    max_cost = std::max(max_cost, cost);
  }
  // Then, generate random numbers in [min_cost, min_cost + cost_range], where
  // cost_range is defined as:
  const Cost cost_range = cost_scale * (max_cost - min_cost);
  for (Cost& cost : model.subset_costs_) {
    cost = min_cost + absl::Uniform<double>(bitgen, 0, cost_range);
  }
  model.CreateSparseRowView();
  return model;
}
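
// A hypothetical usage sketch (`seed` stands for a model loaded elsewhere):
// generate an instance with twice as many elements and subsets while keeping
// the seed's size and cost distributions.
//
//   SetCoverModel generated = SetCoverModel::GenerateRandomModelFrom(
//       seed, /*num_elements=*/2 * seed.num_elements(),
//       /*num_subsets=*/2 * seed.num_subsets(),
//       /*row_scale=*/2.0, /*column_scale=*/2.0, /*cost_scale=*/1.0);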

void SetCoverModel::UpdateAllSubsetsList() {
  const BaseInt old_size = all_subsets_.size();
  DCHECK_LE(old_size, num_subsets());
  all_subsets_.resize(num_subsets());
  for (BaseInt subset(old_size); subset < num_subsets(); ++subset) {
    all_subsets_[subset] = SubsetIndex(subset);
  }
}

void SetCoverModel::AddEmptySubset(Cost cost) {
  // TODO(user): refine the logic for is_unicost_ and is_unicost_valid_.
  is_unicost_valid_ = false;
  elements_in_subsets_are_sorted_ = false;
  subset_costs_.push_back(cost);
  columns_.push_back(SparseColumn());
  all_subsets_.push_back(SubsetIndex(num_subsets_));
  ++num_subsets_;
  CHECK_EQ(columns_.size(), num_subsets());
  CHECK_EQ(subset_costs_.size(), num_subsets());
  CHECK_EQ(all_subsets_.size(), num_subsets());
  row_view_is_valid_ = false;
}

void SetCoverModel::AddElementToLastSubset(BaseInt element) {
  elements_in_subsets_are_sorted_ = false;
  columns_.back().push_back(ElementIndex(element));
  num_elements_ = std::max(num_elements_, element + 1);
  // No need to update the list all_subsets_.
  ++num_nonzeros_;
  row_view_is_valid_ = false;
}

void SetCoverModel::AddElementToLastSubset(ElementIndex element) {
  AddElementToLastSubset(element.value());
}

void SetCoverModel::SetSubsetCost(BaseInt subset, Cost cost) {
  // TODO(user): refine the logic for is_unicost_ and is_unicost_valid_.
  // NOMUTANTS -- this is a performance optimization.
  is_unicost_valid_ = false;
  elements_in_subsets_are_sorted_ = false;
  CHECK(std::isfinite(cost));
  DCHECK_GE(subset, 0);
  if (subset >= num_subsets()) {
    num_subsets_ = std::max(num_subsets_, subset + 1);
    columns_.resize(num_subsets_, SparseColumn());
    subset_costs_.resize(num_subsets_, 0.0);
    row_view_is_valid_ = false;
    UpdateAllSubsetsList();
  }
  subset_costs_[SubsetIndex(subset)] = cost;
}

void SetCoverModel::SetSubsetCost(SubsetIndex subset, Cost cost) {
  SetSubsetCost(subset.value(), cost);
}

void SetCoverModel::AddElementToSubset(BaseInt element, BaseInt subset) {
  elements_in_subsets_are_sorted_ = false;
  if (subset >= num_subsets()) {
    num_subsets_ = subset + 1;
    subset_costs_.resize(num_subsets_, 0.0);
    columns_.resize(num_subsets_, SparseColumn());
    UpdateAllSubsetsList();
  }
  columns_[SubsetIndex(subset)].push_back(ElementIndex(element));
  num_elements_ = std::max(num_elements_, element + 1);
  ++num_nonzeros_;
  row_view_is_valid_ = false;
}

void SetCoverModel::AddElementToSubset(ElementIndex element,
                                       SubsetIndex subset) {
  AddElementToSubset(element.value(), subset.value());
}

void SetCoverModel::ResizeNumSubsets(BaseInt num_subsets) {
  num_subsets_ = std::max(num_subsets_, num_subsets);
  columns_.resize(num_subsets_, SparseColumn());
  subset_costs_.resize(num_subsets_, 0.0);
  UpdateAllSubsetsList();
}

void SetCoverModel::ReserveNumElementsInSubset(BaseInt num_elements,
                                               BaseInt subset) {
  ResizeNumSubsets(subset);
  columns_[SubsetIndex(subset)].reserve(ColumnEntryIndex(num_elements));
}

void SetCoverModel::SortElementsInSubsets() {
  for (const SubsetIndex subset : SubsetRange()) {
    // std::sort(columns_[subset].begin(), columns_[subset].end());
    BaseInt* data = reinterpret_cast<BaseInt*>(columns_[subset].data());
    RadixSort(absl::MakeSpan(data, columns_[subset].size()));
  }
  elements_in_subsets_are_sorted_ = true;
}

void SetCoverModel::CreateSparseRowView() {
  StopWatch stop_watch(&create_sparse_row_view_duration_);
  if (row_view_is_valid_) {
    VLOG(1) << "CreateSparseRowView: already valid";
    return;
  }
  VLOG(1) << "CreateSparseRowView started";
  rows_.resize(num_elements_, SparseRow());
  ElementToIntVector row_sizes(num_elements_, 0);
  for (const SubsetIndex subset : SubsetRange()) {
    BaseInt* data = reinterpret_cast<BaseInt*>(columns_[subset].data());
    RadixSort(absl::MakeSpan(data, columns_[subset].size()));

    ElementIndex preceding_element(-1);
    for (const ElementIndex element : columns_[subset]) {
      CHECK_GT(element, preceding_element)
          << "Repetition in column "
          << subset;  // Fail if there is a repetition.
      ++row_sizes[element];
      preceding_element = element;
    }
  }
  for (const ElementIndex element : ElementRange()) {
    rows_[element].reserve(RowEntryIndex(row_sizes[element]));
  }
  for (const SubsetIndex subset : SubsetRange()) {
    for (const ElementIndex element : columns_[subset]) {
      rows_[element].emplace_back(subset);
    }
  }
  row_view_is_valid_ = true;
  elements_in_subsets_are_sorted_ = true;
  VLOG(1) << "CreateSparseRowView finished";
}

std::vector<SubsetIndex> SetCoverModel::ComputeSliceIndices(
    int num_partitions) {
  if (num_partitions <= 1 || columns_.empty()) {
    return {SubsetIndex(columns_.size())};
  }

  std::vector<BaseInt> partial_sum_nnz(columns_.size());
  partial_sum_nnz[0] = columns_[SubsetIndex(0)].size();
  for (BaseInt col = 1; col < columns_.size(); ++col) {
    partial_sum_nnz[col] =
        partial_sum_nnz[col - 1] + columns_[SubsetIndex(col)].size();
  }
  const BaseInt total_nnz = partial_sum_nnz.back();
  const BaseInt target_nnz = (total_nnz + num_partitions - 1) / num_partitions;

  std::vector<SubsetIndex> partition_points(num_partitions);
  BaseInt threshold = target_nnz;
  BaseInt pos = 0;
  for (const SubsetIndex col : SubsetRange()) {
    if (partial_sum_nnz[col.value()] >= threshold) {
      partition_points[pos] = col;
      ++pos;
      threshold += target_nnz;
    }
  }
  partition_points[num_partitions - 1] = SubsetIndex(columns_.size());
  return partition_points;
}
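
// Worked example (sizes made up): with column sizes {3, 1, 4, 2} and
// num_partitions = 2, partial_sum_nnz = {3, 4, 8, 10}, total_nnz = 10 and
// target_nnz = 5. The partial sums first reach 5 at column 2 and 10 at
// column 3, and the last point is then forced to columns_.size(), giving
// partition points {SubsetIndex(2), SubsetIndex(4)}: slice 0 covers columns
// [0, 2) and slice 1 covers columns [2, 4).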

SparseRowView SetCoverModel::ComputeSparseRowViewSlice(
    SubsetIndex begin_subset, SubsetIndex end_subset) {
  // Sized up front so that rows[element] below is valid.
  SparseRowView rows(num_elements_);
  ElementToIntVector row_sizes(num_elements_, 0);
  for (SubsetIndex subset = begin_subset; subset < end_subset; ++subset) {
    BaseInt* data = reinterpret_cast<BaseInt*>(columns_[subset].data());
    RadixSort(absl::MakeSpan(data, columns_[subset].size()));

    ElementIndex preceding_element(-1);
    for (const ElementIndex element : columns_[subset]) {
      CHECK_GT(element, preceding_element)
          << "Repetition in column "
          << subset;  // Fail if there is a repetition.
      ++row_sizes[element];
      preceding_element = element;
    }
  }
  for (const ElementIndex element : ElementRange()) {
    rows[element].reserve(RowEntryIndex(row_sizes[element]));
  }
  for (SubsetIndex subset = begin_subset; subset < end_subset; ++subset) {
    for (const ElementIndex element : columns_[subset]) {
      rows[element].emplace_back(subset);
    }
  }
  return rows;
}

std::vector<SparseRowView> SetCoverModel::CutSparseRowViewInSlices(
    absl::Span<const SubsetIndex> partition_points) {
  std::vector<SparseRowView> row_views;
  row_views.reserve(partition_points.size());
  SubsetIndex begin_subset(0);
  // This should be done in parallel. It is a bottleneck.
  for (SubsetIndex end_subset : partition_points) {
    row_views.push_back(ComputeSparseRowViewSlice(begin_subset, end_subset));
    begin_subset = end_subset;
  }
  return row_views;
}

SparseRowView SetCoverModel::ReduceSparseRowViewSlices(
    absl::Span<const SparseRowView> slices) {
  SparseRowView result_rows;
  // This is not a ReduceTree. This will be done later through parallelism.
  // resize() (not reserve()) so that result_rows[element] below is valid.
  result_rows.resize(num_elements_);
  for (const SparseRowView& slice : slices) {
    // This should be done as a reduce tree, in parallel.
    for (const ElementIndex element : ElementRange()) {
      result_rows[element].insert(result_rows[element].end(),
                                  slice[element].begin(), slice[element].end());
    }
  }
  return result_rows;
}

SparseRowView SetCoverModel::CreateSparseRowViewUsingSlices() {
  StopWatch stop_watch(&compute_sparse_row_view_using_slices_duration_);
  // The number of slices is an assumption here; one slice per worker thread
  // is a natural choice.
  const int num_partitions = 16;
  VLOG(1) << "CreateSparseRowViewUsingSlices started";
  const std::vector<SubsetIndex> partition_points =
      ComputeSliceIndices(num_partitions);
  const std::vector<SparseRowView> slices =
      CutSparseRowViewInSlices(partition_points);
  const SparseRowView rows = ReduceSparseRowViewSlices(slices);
  VLOG(1) << "CreateSparseRowViewUsingSlices finished";
  return rows;
}

bool SetCoverModel::ComputeFeasibility() const {
  StopWatch stop_watch(&feasibility_duration_);
  CHECK_GT(num_elements(), 0);
  CHECK_GT(num_subsets(), 0);
  CHECK_EQ(columns_.size(), num_subsets());
  CHECK_EQ(subset_costs_.size(), num_subsets());
  CHECK_EQ(all_subsets_.size(), num_subsets());
  for (const Cost cost : subset_costs_) {
    CHECK_GT(cost, 0.0);
  }
  ElementToIntVector possible_coverage(num_elements_, 0);
  SubsetIndex column_index(0);
  for (const SparseColumn& column : columns_) {
    DLOG_IF(INFO, column.empty()) << "Empty column " << column_index.value();
    for (const ElementIndex element : column) {
      ++possible_coverage[element];
    }
    // NOMUTANTS -- column_index is only used for logging in debug mode.
    ++column_index;
  }
  int64_t num_uncoverable_elements = 0;
  for (const ElementIndex element : ElementRange()) {
    // NOMUTANTS -- num_uncoverable_elements is only used for logging.
    if (possible_coverage[element] == 0) {
      ++num_uncoverable_elements;
    }
  }
  VLOG(1) << "num_uncoverable_elements = " << num_uncoverable_elements;
  for (const ElementIndex element : ElementRange()) {
    if (possible_coverage[element] > 0) continue;
    LOG(ERROR) << "Element " << element << " is not covered.";
    return false;
  }
  VLOG(1) << "Max possible coverage = "
          << *std::max_element(possible_coverage.begin(),
                               possible_coverage.end());
  for (const SubsetIndex subset : SubsetRange()) {
    if (all_subsets_[subset.value()] == subset) continue;
    LOG(ERROR) << "subset = " << subset << " all_subsets_[subset.value()] = "
               << all_subsets_[subset.value()];
    return false;
  }
  return true;
}

SetCoverProto SetCoverModel::ExportModelAsProto() const {
  CHECK(elements_in_subsets_are_sorted_);
  SetCoverProto message;
  for (const SubsetIndex subset : SubsetRange()) {
    VLOG_EVERY_N_SEC(1, 5) << absl::StrFormat(
        "Exporting subset %d (%.1f%%)", subset.value(),
        100.0 * subset.value() / num_subsets());
    SetCoverProto::Subset* subset_proto = message.add_subset();
    subset_proto->set_cost(subset_costs_[subset]);
    SparseColumn column = columns_[subset];  // Copy is intentional.
    BaseInt* data = reinterpret_cast<BaseInt*>(column.data());
    RadixSort(absl::MakeSpan(data, column.size()));
    for (const ElementIndex element : column) {
      subset_proto->add_element(element.value());
    }
  }
  VLOG(1) << "Finished exporting the model.";
  return message;
}

void SetCoverModel::ImportModelFromProto(const SetCoverProto& message) {
  columns_.clear();
  subset_costs_.clear();
  ResizeNumSubsets(message.subset_size());
  SubsetIndex subset_index(0);
  for (const SetCoverProto::Subset& subset_proto : message.subset()) {
    subset_costs_[subset_index] = subset_proto.cost();
    if (subset_proto.element_size() > 0) {
      columns_[subset_index].reserve(
          ColumnEntryIndex(subset_proto.element_size()));
      for (const BaseInt element : subset_proto.element()) {
        columns_[subset_index].push_back(ElementIndex(element));
        num_elements_ = std::max(num_elements_, element + 1);
      }
    }
    // Advance even for empty subsets so costs stay aligned with columns.
    ++subset_index;
  }
  UpdateAllSubsetsList();
}
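
// A round-trip sketch (`model` is hypothetical). ExportModelAsProto() CHECKs
// that columns are sorted, which CreateSparseRowView() guarantees:
//
//   model.CreateSparseRowView();
//   const SetCoverProto proto = model.ExportModelAsProto();
//   SetCoverModel copy;
//   copy.ImportModelFromProto(proto);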

std::string SetCoverModel::ToVerboseString(absl::string_view sep) const {
  return absl::StrJoin(
      std::make_tuple("num_elements", num_elements(), "num_subsets",
                      num_subsets(), "num_nonzeros", num_nonzeros(),
                      "fill_rate", FillRate()),
      sep);
}

std::string SetCoverModel::ToString(absl::string_view sep) const {
  return absl::StrJoin(std::make_tuple(num_elements(), num_subsets(),
                                       num_nonzeros(), FillRate()),
                       sep);
}

namespace {
// Returns the standard deviation of the values, excluding those that are
// zero.
template <typename T>
double StandardDeviation(const std::vector<T>& values) {
  const size_t size = values.size();
  double n = 0.0;  // n is used in a calculation involving doubles.
  double sum_of_squares = 0.0;
  double sum = 0.0;
  for (size_t i = 0; i < size; ++i) {
    double sample = static_cast<double>(values[i]);
    if (sample == 0.0) continue;
    sum_of_squares += sample * sample;
    sum += sample;
    ++n;
  }
  // Since we know all the values, we can compute the standard deviation
  // exactly.
  return n == 0.0 ? 0.0 : sqrt((sum_of_squares - sum * sum / n) / n);
}
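
// Sanity check with made-up values {1, 2, 3} (no zeros to exclude): n = 3,
// sum = 6, sum_of_squares = 14, so the variance is (14 - 6 * 6 / 3) / 3 =
// 2 / 3 and the standard deviation is sqrt(2 / 3). This is the population
// formula E[x^2] - E[x]^2, not the sample (n - 1) one.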

// Statistics accumulation class used to compute statistics on the deltas of
// the row and column elements and their sizes in bytes.
// Since the values are not all stored, it's not possible to compute the median
// exactly. It is returned as 0.0. NaN would be a better choice, but it's just
// not a good idea as NaNs can propagate and cause problems.
class StatsAccumulator {
 public:
  StatsAccumulator()
      : count_(0),
        min_(kInfinity),
        max_(-kInfinity),
        sum_(0.0),
        sum_of_squares_(0.0) {}

  void Register(double value) {
    ++count_;
    min_ = std::min(min_, value);
    max_ = std::max(max_, value);
    sum_ += value;
    sum_of_squares_ += value * value;
  }

  SetCoverModel::Stats ComputeStats() const {
    const int64_t n = count_;
    // Since the code is used on a known number of values, we can compute the
    // standard deviation exactly, even if the values are not all stored.
    const double stddev =
        n == 0 ? 0.0 : sqrt((sum_of_squares_ - sum_ * sum_ / n) / n);
    return SetCoverModel::Stats{min_, max_, 0.0, sum_ / n, stddev};
  }

 private:
  static constexpr double kInfinity = std::numeric_limits<double>::infinity();
  int64_t count_;
  double min_;
  double max_;
  double sum_;
  double sum_of_squares_;
};
}  // namespace

template <typename T>
SetCoverModel::Stats ComputeStats(std::vector<T> samples) {
  SetCoverModel::Stats stats;
  stats.min = *std::min_element(samples.begin(), samples.end());
  stats.max = *std::max_element(samples.begin(), samples.end());
  stats.mean =
      std::accumulate(samples.begin(), samples.end(), 0.0) / samples.size();
  auto const q1 = samples.size() / 4;
  auto const q2 = samples.size() / 2;
  auto const q3 = q1 + q2;
  // The first call to nth_element is O(n). The 2nd and 3rd calls are O(n / 2).
  // Basically it's equivalent to running nth_element twice.
  // One might be tempted to use a faster sorting algorithm like radix sort,
  // but it is not clear that we would gain much. There would be no gain in
  // complexity whatsoever anyway. On top of that, this code is called at most
  // once per problem, so it's not worth the effort.
  std::nth_element(samples.begin(), samples.begin() + q2, samples.end());
  std::nth_element(samples.begin(), samples.begin() + q1, samples.begin() + q2);
  std::nth_element(samples.begin() + q2, samples.begin() + q3, samples.end());
  stats.median = samples[q2];
  stats.iqr = samples[q3] - samples[q1];
  stats.stddev = StandardDeviation(samples);
  return stats;
}
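
// Illustration (sample count made up): for 8 samples, q1 = 2, q2 = 4 and
// q3 = 6. The first nth_element call partitions the whole vector around
// index 4; the second and third then only touch one half each, so the total
// work is roughly that of two full nth_element passes, as noted above.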

template <typename T>
std::vector<T> ComputeDeciles(std::vector<T> values) {
  const int kNumDeciles = 10;
  std::vector<T> deciles(kNumDeciles, T{0});
  const double step = static_cast<double>(values.size()) / kNumDeciles;
  for (int i = 1; i < kNumDeciles; ++i) {
    const size_t point = std::clamp<double>(i * step, 0, values.size() - 1);
    std::nth_element(values.begin(), values.begin() + point, values.end());
    deciles[i] = values[point];
  }
  return deciles;
}
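
// For example (input size made up): with 50 values, step = 5.0 and the
// deciles are read at ranks 5, 10, ..., 45; deciles[0] is left at zero, so
// the minimum is not reported. Each nth_element call places the right order
// statistic at `point` without fully sorting the vector.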

SetCoverModel::Stats SetCoverModel::ComputeCostStats() const {
  std::vector<Cost> subset_costs(num_subsets());
  std::copy(subset_costs_.begin(), subset_costs_.end(), subset_costs.begin());
  return ComputeStats(std::move(subset_costs));
}

SetCoverModel::Stats SetCoverModel::ComputeRowStats() const {
  std::vector<int64_t> row_sizes(num_elements(), 0);
  for (const SparseColumn& column : columns_) {
    for (const ElementIndex element : column) {
      ++row_sizes[element.value()];
    }
  }
  return ComputeStats(std::move(row_sizes));
}

SetCoverModel::Stats SetCoverModel::ComputeColumnStats() const {
  std::vector<int64_t> column_sizes(columns_.size());
  for (const SubsetIndex subset : SubsetRange()) {
    column_sizes[subset.value()] = columns_[subset].size();
  }
  return ComputeStats(std::move(column_sizes));
}

std::string SetCoverModel::Stats::ToString(absl::string_view sep) const {
  return absl::StrJoin(std::make_tuple(min, max, median, mean, stddev, iqr),
                       sep);
}

std::string SetCoverModel::Stats::ToVerboseString(absl::string_view sep) const {
  return absl::StrJoin(
      std::make_tuple("min", min, "max", max, "median", median, "mean", mean,
                      "stddev", stddev, "iqr", iqr),
      sep);
}

std::vector<int64_t> SetCoverModel::ComputeRowDeciles() const {
  std::vector<int64_t> row_sizes(num_elements(), 0);
  for (const SparseColumn& column : columns_) {
    for (const ElementIndex element : column) {
      ++row_sizes[element.value()];
    }
  }
  return ComputeDeciles(std::move(row_sizes));
}

std::vector<int64_t> SetCoverModel::ComputeColumnDeciles() const {
  std::vector<int64_t> column_sizes(columns_.size());
  for (const SubsetIndex subset : SubsetRange()) {
    column_sizes[subset.value()] = columns_[subset].size();
  }
  return ComputeDeciles(std::move(column_sizes));
}

namespace {
// Returns the number of bytes needed to store x with a base-128 encoding.
BaseInt Base128SizeInBytes(BaseInt x) {
  const uint64_t u = x == 0 ? 1 : static_cast<uint64_t>(x);
  return (std::numeric_limits<uint64_t>::digits - absl::countl_zero(u) + 6) / 7;
}
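
// A few reference points (by direct computation): x = 0 and x = 127 fit in
// one byte, x = 128 needs two, and x = 16384 needs three, matching the 7
// payload bits per byte of a base-128 (varint) encoding.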
}  // namespace

SetCoverModel::Stats SetCoverModel::ComputeColumnDeltaSizeStats() const {
  StatsAccumulator acc;
  for (const SparseColumn& column : columns_) {
    int64_t previous = 0;
    for (const ElementIndex element : column) {
      const int64_t delta = element.value() - previous;
      previous = element.value();
      acc.Register(Base128SizeInBytes(delta));
    }
  }
  return acc.ComputeStats();
}

SetCoverModel::Stats SetCoverModel::ComputeRowDeltaSizeStats() const {
  StatsAccumulator acc;
  for (const SparseRow& row : rows_) {
    int64_t previous = 0;
    for (const SubsetIndex subset : row) {
      const int64_t delta = subset.value() - previous;
      previous = subset.value();
      acc.Register(Base128SizeInBytes(delta));
    }
  }
  return acc.ComputeStats();
}
}  // namespace operations_research