| Directory: | cvmfs/ |
|---|---|
| File: | cvmfs/ingestion/chunk_detector.cc |
| Date: | 2025-11-16 02:35:16 |
| | Exec | Total | Coverage |
|---|---|---|---|
| Lines: | 56 | 56 | 100.0% |
| Branches: | 33 | 46 | 71.7% |
| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /** | ||
| 2 | * This file is part of the CernVM File System. | ||
| 3 | */ | ||
| 4 | |||
| 5 | |||
| 6 | #include "chunk_detector.h" | ||
| 7 | |||
| 8 | #include <algorithm> | ||
| 9 | #include <cassert> | ||
| 10 | #include <limits> | ||
| 11 | |||
| 12 | #include "ingestion/item.h" | ||
| 13 | |||
| 14 | |||
| 15 | 2102824 | uint64_t ChunkDetector::FindNextCutMark(BlockItem *block) { | |
| 16 | 2102824 | const uint64_t result = DoFindNextCutMark(block); | |
| 17 | 2/2: ✓ Branch 0 taken 1543639 times. ✓ Branch 1 taken 559068 times. | 2102707 | if (result == 0) |
| 18 | 1543639 | offset_ += block->size(); | |
| 19 | 2102629 | return result; | |
| 20 | } | ||
| 21 | |||
| 22 | |||
| 23 | //------------------------------------------------------------------------------ | ||
| 24 | |||
| 25 | |||
| 26 | 307512 | uint64_t StaticOffsetDetector::DoFindNextCutMark(BlockItem *buffer) { | |
| 27 | 1/2: ✗ Branch 1 not taken. ✓ Branch 2 taken 307512 times. | 307512 | assert(buffer->type() == BlockItem::kBlockData); |
| 28 | |||
| 29 | 307512 | const uint64_t beginning = offset(); | |
| 30 | 307512 | const uint64_t end = offset() + buffer->size(); | |
| 31 | |||
| 32 | 307512 | const uint64_t next_cut = last_cut() + chunk_size_; | |
| 33 | 3/4: ✓ Branch 0 taken 307512 times. ✗ Branch 1 not taken. ✓ Branch 2 taken 307203 times. ✓ Branch 3 taken 309 times. | 307512 | if (next_cut >= beginning && next_cut < end) { |
| 34 | 307203 | return DoCut(next_cut); | |
| 35 | } | ||
| 36 | |||
| 37 | 309 | return NoCut(next_cut); | |
| 38 | } | ||
| 39 | |||
| 40 | |||
| 41 | //------------------------------------------------------------------------------ | ||
| 42 | |||
| 43 | |||
| 44 | // This defines the center of the interval where the xor32 rolling checksum is | ||
| 45 | // queried. You should never change this number, since it affects the definition | ||
| 46 | // of cut marks. | ||
| 47 | const int32_t Xor32Detector::kMagicNumber = std::numeric_limits<uint32_t>::max() | ||
| 48 | / 2; | ||
| 49 | |||
| 50 | |||
| 51 | 9874009 | Xor32Detector::Xor32Detector(const uint64_t minimal_chunk_size, | |
| 52 | const uint64_t average_chunk_size, | ||
| 53 | 9874009 | const uint64_t maximal_chunk_size) | |
| 54 | 9874009 | : minimal_chunk_size_(minimal_chunk_size) | |
| 55 | 9874009 | , average_chunk_size_(average_chunk_size) | |
| 56 | 9874009 | , maximal_chunk_size_(maximal_chunk_size) | |
| 57 | 47 | , threshold_( | |
| 58 | (average_chunk_size > 0) | ||
| 59 | 9873962 | ? (std::numeric_limits<uint32_t>::max() / average_chunk_size) | |
| 60 | : 0) | ||
| 61 | 9874009 | , xor32_ptr_(0) | |
| 62 | 2/2: ✓ Branch 1 taken 9873962 times. ✓ Branch 2 taken 47 times. | 9874009 | , xor32_(0) { |
| 63 | 3/4: ✓ Branch 0 taken 9873962 times. ✓ Branch 1 taken 47 times. ✗ Branch 2 not taken. ✓ Branch 3 taken 9873962 times. | 9874009 | assert((average_chunk_size_ == 0) \|\| (minimal_chunk_size_ > 0)); |
| 64 | 2/2: ✓ Branch 0 taken 9873962 times. ✓ Branch 1 taken 47 times. | 9874009 | if (minimal_chunk_size_ > 0) { |
| 65 | 1/2: ✗ Branch 0 not taken. ✓ Branch 1 taken 9873962 times. | 9873962 | assert(minimal_chunk_size_ >= kXor32Window); |
| 66 | 1/2: ✗ Branch 0 not taken. ✓ Branch 1 taken 9873962 times. | 9873962 | assert(minimal_chunk_size_ < average_chunk_size_); |
| 67 | 1/2: ✗ Branch 0 not taken. ✓ Branch 1 taken 9873962 times. | 9873962 | assert(average_chunk_size_ < maximal_chunk_size_); |
| 68 | } | ||
| 69 | 9874009 | } | |
| 70 | |||
| 71 | |||
| 72 | 1795273 | uint64_t Xor32Detector::DoFindNextCutMark(BlockItem *buffer) { | |
| 73 | 1/2: ✗ Branch 0 not taken. ✓ Branch 1 taken 1795273 times. | 1795273 | assert(minimal_chunk_size_ > 0); |
| 74 | 1795273 | const unsigned char *data = buffer->data(); | |
| 75 | |||
| 76 | // Get the offset where the next xor32 computation needs to be continued | ||
| 77 | // Note: this is either kXor32Window bytes before minimal_chunk_size_ bytes | ||
| 78 | // have been collected in the current chunk, or at xor32_ptr_ when the | ||
| 79 | // search for a cut mark is already in progress | ||
| 80 | 1795234 | const uint64_t global_offset = std::max( | |
| 81 | 1795234 | last_cut() + static_cast<uint64_t>(minimal_chunk_size_ - kXor32Window), | |
| 82 | 1795273 | xor32_ptr_); | |
| 83 | |||
| 84 | // Check if the next xor32 computation is taking place in the current buffer | ||
| 85 | 2/2: ✓ Branch 2 taken 736089 times. ✓ Branch 3 taken 1059262 times. | 1795312 | if (global_offset >= offset() + static_cast<uint64_t>(buffer->size())) { |
| 86 | 1/2: ✓ Branch 1 taken 736011 times. ✗ Branch 2 not taken. | 736089 | return NoCut(global_offset); |
| 87 | } | ||
| 88 | |||
| 89 | // get the byte offset in the current buffer | ||
| 90 | 1059262 | uint64_t internal_offset = global_offset - offset(); | |
| 91 | 1/2: ✗ Branch 1 not taken. ✓ Branch 2 taken 1059262 times. | 1059262 | assert(internal_offset < static_cast<uint64_t>(buffer->size())); |
| 92 | |||
| 93 | // Precompute the xor32 rolling checksum for finding the next cut mark | ||
| 94 | // Note: this might be skipped, if the precomputation was already performed | ||
| 95 | // for the current rolling checksum | ||
| 96 | // (internal_precompute_end <= internal_offset, possibly negative --> loop is not entered) | ||
| 97 | 1059262 | const uint64_t precompute_end = last_cut() + minimal_chunk_size_; | |
| 98 | 1059262 | const int64_t internal_precompute_end = std::min( | |
| 99 | 1059262 | static_cast<int64_t>(precompute_end - offset()), | |
| 100 | 1059262 | static_cast<int64_t>(buffer->size())); | |
| 101 | 1/2: ✗ Branch 0 not taken. ✓ Branch 1 taken 1059262 times. | 1059262 | assert(internal_precompute_end - static_cast<int64_t>(internal_offset) |
| 102 | <= static_cast<int64_t>(kXor32Window)); | ||
| 103 | 2/2: ✓ Branch 0 taken 8071390 times. ✓ Branch 1 taken 1059262 times. | 9130652 | for (; static_cast<int64_t>(internal_offset) < internal_precompute_end; |
| 104 | ++internal_offset) { | ||
| 105 | 8071390 | xor32(data[internal_offset]); | |
| 106 | } | ||
| 107 | |||
| 108 | // Do the actual computation and try to find a xor32 based cut mark | ||
| 109 | // Note: this loop is bounded either by maximal_chunk_size_ or by the size of | ||
| 110 | // the current buffer; in the latter case the computation continues later | ||
| 111 | 1059262 | const uint64_t internal_max_chunk_size_end = last_cut() + maximal_chunk_size_ | |
| 112 | 1059262 | - offset(); | |
| 113 | 1059223 | const uint64_t internal_compute_end = std::min( | |
| 114 | 1059262 | internal_max_chunk_size_end, static_cast<uint64_t>(buffer->size())); | |
| 115 | 2/2: ✓ Branch 0 taken 9346578203 times. ✓ Branch 1 taken 810344 times. | 9347388547 | for (; internal_offset < internal_compute_end; ++internal_offset) { |
| 116 | 9346578203 | xor32(data[internal_offset]); | |
| 117 | |||
| 118 | // check if we found a cut mark | ||
| 119 | 2/2: ✓ Branch 1 taken 249035 times. ✓ Branch 2 taken 9307448274 times. | 9323263145 | if (CheckThreshold()) { |
| 120 | 1/2: ✓ Branch 2 taken 249035 times. ✗ Branch 3 not taken. | 249035 | return DoCut(internal_offset + offset()); |
| 121 | } | ||
| 122 | } | ||
| 123 | |||
| 124 | // Check if the loop was exited because we reached maximal_chunk_size_ and do | ||
| 125 | // a hard cut in this case. If not, it exited because we ran out of data in | ||
| 126 | // this buffer --> continue computation with the next buffer | ||
| 127 | 2/2: ✓ Branch 0 taken 2830 times. ✓ Branch 1 taken 807514 times. | 810344 | if (internal_offset == internal_max_chunk_size_end) { |
| 128 | 1/2: ✓ Branch 2 taken 2830 times. ✗ Branch 3 not taken. | 2830 | return DoCut(internal_offset + offset()); |
| 129 | } else { | ||
| 130 | 1/2: ✓ Branch 2 taken 807514 times. ✗ Branch 3 not taken. | 807514 | return NoCut(internal_offset + offset()); |
| 131 | } | ||
| 132 | } | ||
| 133 |