| Directory: | cvmfs/ |
|---|---|
| File: | cvmfs/garbage_collection/hash_filter.h |
| Date: | 2025-11-09 02:35:23 |
| Exec | Total | Coverage | |
|---|---|---|---|
| Lines: | 24 | 25 | 96.0% |
| Branches: | 8 | 16 | 50.0% |
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /** | ||
| 2 | * This file is part of the CernVM File System. | ||
| 3 | * | ||
| 4 | * HashFilters are a container classes that get initialized with a number of | ||
| 5 | * hashes. Later they can serve queries for other hashes and decide if they are | ||
| 6 | * contained in the filter or not. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #ifndef CVMFS_GARBAGE_COLLECTION_HASH_FILTER_H_ | ||
| 10 | #define CVMFS_GARBAGE_COLLECTION_HASH_FILTER_H_ | ||
| 11 | |||
| 12 | #include <set> | ||
| 13 | |||
| 14 | #include "crypto/hash.h" | ||
| 15 | #include "smallhash.h" | ||
| 16 | |||
| 17 | /** | ||
| 18 | * Abstract base class of a HashFilter to define the common interface. | ||
| 19 | */ | ||
| 20 | class AbstractHashFilter { | ||
| 21 | public: | ||
| 22 | 3984 | virtual ~AbstractHashFilter() { } | |
| 23 | |||
| 24 | /** | ||
| 25 | * Adds the given hash to the filter | ||
| 26 | * | ||
| 27 | * @param hash the hash to be added to the HashFilter | ||
| 28 | */ | ||
| 29 | virtual void Fill(const shash::Any &hash) = 0; | ||
| 30 | |||
| 31 | /** | ||
| 32 | * Decides if a presented hash is in the filter or not | ||
| 33 | * Depending on the concrete implementation of this method it could be a prob- | ||
| 34 | * abilistic answer. However, implementations should ensure a recall rate of | ||
| 35 | * 100%, say: never produce false negatives. | ||
| 36 | * | ||
| 37 | * @param hash the hash to be queried | ||
| 38 | * @return true if the hash is (probably) contained in the set | ||
| 39 | * false if it is definitely not in the set | ||
| 40 | */ | ||
| 41 | virtual bool Contains(const shash::Any &hash) const = 0; | ||
| 42 | |||
| 43 | /** | ||
| 44 | * Freezes the filter after filling it with all values. This is not necessary | ||
| 45 | * but could be used for certain optimizations depending on the implementation | ||
| 46 | * of the AbstractHashFilter. | ||
| 47 | * Note: After Freeze() has been called, Fill() should fail! | ||
| 48 | */ | ||
| 49 | ✗ | virtual void Freeze() { } | |
| 50 | |||
| 51 | /** | ||
| 52 | * Returns the number of objects already inserted into the filter. | ||
| 53 | * @return number of objects in the filter | ||
| 54 | */ | ||
| 55 | virtual size_t Count() const = 0; | ||
| 56 | }; | ||
| 57 | |||
| 58 | |||
| 59 | //------------------------------------------------------------------------------ | ||
| 60 | |||
| 61 | |||
| 62 | /** | ||
| 63 | * This is a simplistic implementation of AbstractHashFilter mainly used for | ||
| 64 | * testing purposes. It uses an std::set and thus is highly suboptimal. | ||
| 65 | */ | ||
| 66 | class SimpleHashFilter : public AbstractHashFilter { | ||
| 67 | public: | ||
| 68 | 1911 | SimpleHashFilter() : frozen_(false) { } | |
| 69 | |||
| 70 | 7067928 | void Fill(const shash::Any &hash) { | |
| 71 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 7067928 times.
|
7067928 | assert(!frozen_); |
| 72 | 7067928 | hashes_.insert(hash); | |
| 73 | 7067928 | } | |
| 74 | |||
| 75 | 7033201 | bool Contains(const shash::Any &hash) const { | |
| 76 |
1/2✓ Branch 2 taken 7033201 times.
✗ Branch 3 not taken.
|
7033201 | return hashes_.find(hash) != hashes_.end(); |
| 77 | } | ||
| 78 | |||
| 79 | 56 | void Freeze() { frozen_ = true; } | |
| 80 | 105 | size_t Count() const { return hashes_.size(); } | |
| 81 | |||
| 82 | private: | ||
| 83 | std::set<shash::Any> hashes_; | ||
| 84 | bool frozen_; | ||
| 85 | }; | ||
| 86 | |||
| 87 | |||
| 88 | //------------------------------------------------------------------------------ | ||
| 89 | |||
| 90 | |||
| 91 | /** | ||
| 92 | * This is an implementation of AbstractHashFilter using the SmallHash structure | ||
| 93 | * for internal storage. | ||
| 94 | */ | ||
| 95 | class SmallhashFilter : public AbstractHashFilter { | ||
| 96 | protected: | ||
| 97 | 18001611 | static uint32_t hasher(const shash::Any &key) { | |
| 98 | // Don't start with the first bytes, because == is using them as well | ||
| 99 | return static_cast<uint32_t>( | ||
| 100 | 18001611 | *(reinterpret_cast<const uint32_t *>(key.digest) + 1)); | |
| 101 | } | ||
| 102 | |||
| 103 | public: | ||
| 104 |
1/2✓ Branch 2 taken 81 times.
✗ Branch 3 not taken.
|
81 | SmallhashFilter() : frozen_(false) { |
| 105 | // zero_element is MD5("unobtanium") | ||
| 106 | const shash::Any zero_element( | ||
| 107 |
2/4✓ Branch 2 taken 81 times.
✗ Branch 3 not taken.
✓ Branch 6 taken 81 times.
✗ Branch 7 not taken.
|
81 | shash::kMd5, shash::HexPtr("d61f853acc5a39e01f3906f73e31d256")); |
| 108 |
1/2✓ Branch 1 taken 81 times.
✗ Branch 2 not taken.
|
81 | hashmap_.Init(1048576, zero_element, &SmallhashFilter::hasher); |
| 109 | 81 | } | |
| 110 | |||
| 111 | 9000558 | void Fill(const shash::Any &hash) { | |
| 112 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 9000558 times.
|
9000558 | assert(!frozen_); |
| 113 |
1/2✓ Branch 1 taken 9000558 times.
✗ Branch 2 not taken.
|
9000558 | hashmap_.Insert(hash, true); |
| 114 | 9000558 | } | |
| 115 | |||
| 116 | 9001053 | bool Contains(const shash::Any &hash) const { | |
| 117 | 9001053 | return hashmap_.Contains(hash); | |
| 118 | } | ||
| 119 | |||
| 120 | 72 | void Freeze() { frozen_ = true; } | |
| 121 | 135 | size_t Count() const { return hashmap_.size(); } | |
| 122 | |||
| 123 | private: | ||
| 124 | SmallHashDynamic<shash::Any, bool> hashmap_; | ||
| 125 | bool frozen_; | ||
| 126 | }; | ||
| 127 | |||
| 128 | #endif // CVMFS_GARBAGE_COLLECTION_HASH_FILTER_H_ | ||
| 129 |