Directory: | cvmfs/ |
---|---|
File: | cvmfs/ingestion/chunk_detector.cc |
Date: | 2025-04-20 02:34:28 |
Exec | Total | Coverage | |
---|---|---|---|
Lines: | 55 | 55 | 100.0% |
Branches: | 33 | 46 | 71.7% |
Line | Branch | Exec | Source |
---|---|---|---|
1 | /** | ||
2 | * This file is part of the CernVM File System. | ||
3 | */ | ||
4 | |||
5 | |||
6 | #include "chunk_detector.h" | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <cassert> | ||
10 | #include <limits> | ||
11 | |||
12 | #include "ingestion/item.h" | ||
13 | |||
14 | |||
15 | 147282 | uint64_t ChunkDetector::FindNextCutMark(BlockItem *block) { | |
16 | 147282 | uint64_t result = DoFindNextCutMark(block); | |
17 |
2/2✓ Branch 0 taken 39050 times.
✓ Branch 1 taken 108233 times.
|
147283 | if (result == 0) |
18 | 39050 | offset_ += block->size(); | |
19 | 147279 | return result; | |
20 | } | ||
21 | |||
22 | |||
23 | //------------------------------------------------------------------------------ | ||
24 | |||
25 | |||
26 | 102504 | uint64_t StaticOffsetDetector::DoFindNextCutMark(BlockItem *buffer) { | |
27 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 102504 times.
|
102504 | assert(buffer->type() == BlockItem::kBlockData); |
28 | |||
29 | 102504 | const uint64_t beginning = offset(); | |
30 | 102504 | const uint64_t end = offset() + buffer->size(); | |
31 | |||
32 | 102504 | const uint64_t next_cut = last_cut() + chunk_size_; | |
33 |
3/4✓ Branch 0 taken 102504 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 102401 times.
✓ Branch 3 taken 103 times.
|
102504 | if (next_cut >= beginning && next_cut < end) { |
34 | 102401 | return DoCut(next_cut); | |
35 | } | ||
36 | |||
37 | 103 | return NoCut(next_cut); | |
38 | } | ||
39 | |||
40 | |||
41 | //------------------------------------------------------------------------------ | ||
42 | |||
43 | |||
44 | |||
// This defines the center of the interval where the xor32 rolling checksum is
// queried. You should never change this number, since it affects the definition
// of cut marks.
// (uint32_max / 2 == 2147483647, which still fits into the int32_t below.)
const int32_t Xor32Detector::kMagicNumber =
    std::numeric_limits<uint32_t>::max() / 2;
50 | |||
51 | |||
/**
 * Configures the xor32 rolling-checksum chunk detector.
 *
 * @param minimal_chunk_size  lower bound of a chunk; no cut mark is searched
 *                            within the first minimal_chunk_size bytes.  A
 *                            value of 0 disables chunking (see asserts below).
 * @param average_chunk_size  targeted mean chunk size, used to derive the
 *                            checksum threshold
 * @param maximal_chunk_size  upper bound of a chunk; a hard cut is enforced
 *                            at this size (see DoFindNextCutMark)
 */
Xor32Detector::Xor32Detector(const uint64_t minimal_chunk_size,
                             const uint64_t average_chunk_size,
                             const uint64_t maximal_chunk_size)
  : minimal_chunk_size_(minimal_chunk_size)
  , average_chunk_size_(average_chunk_size)
  , maximal_chunk_size_(maximal_chunk_size)
    // For a uniformly distributed 32 bit checksum, a value below
    // uint32_max / average_chunk_size occurs on average once every
    // average_chunk_size bytes.  The ternary guards against division by
    // zero when chunking is disabled (average_chunk_size == 0).
    // NOTE(review): presumably consumed by CheckThreshold() — confirm in the
    // class header.
  , threshold_((average_chunk_size > 0)
               ? (std::numeric_limits<uint32_t>::max() / average_chunk_size)
               : 0)
  , xor32_ptr_(0)  // absolute position up to which the checksum was computed
  , xor32_(0)      // current rolling checksum value
{
  // Either chunking is fully disabled (average == 0) or a minimum is set
  assert((average_chunk_size_ == 0) || (minimal_chunk_size_ > 0));
  if (minimal_chunk_size_ > 0) {
    // The rolling window must fit into the smallest allowed chunk and the
    // three size bounds must be strictly ordered
    assert(minimal_chunk_size_ >= kXor32Window);
    assert(minimal_chunk_size_ < average_chunk_size_);
    assert(average_chunk_size_ < maximal_chunk_size_);
  }
}
71 | |||
72 | |||
/**
 * Scans the buffer for the next content-defined cut mark based on the xor32
 * rolling checksum.  The computation state (xor32_, xor32_ptr_) is carried
 * across calls, so the scan resumes seamlessly on the next buffer when no
 * cut mark is found in this one.
 *
 * @param buffer  data block to scan
 * @return the absolute cut position, or 0 (via NoCut) if none is found here
 */
uint64_t Xor32Detector::DoFindNextCutMark(BlockItem *buffer) {
  assert(minimal_chunk_size_ > 0);
  const unsigned char *data = buffer->data();

  // Get the offset where the next xor32 computation needs to be continued
  // Note: this could be after collecting at least kMinChunkSize bytes in the
  //       current chunk, or directly at the beginning of the buffer, when a
  //       cut mark is currently searched
  const uint64_t global_offset =
      std::max(
          last_cut() +
              static_cast<uint64_t>(minimal_chunk_size_ - kXor32Window),
          xor32_ptr_);

  // Check if the next xor32 computation is taking place in the current buffer
  if (global_offset >= offset() + static_cast<uint64_t>(buffer->size())) {
    return NoCut(global_offset);
  }

  // get the byte offset in the current buffer
  uint64_t internal_offset = global_offset - offset();
  assert(internal_offset < static_cast<uint64_t>(buffer->size()));

  // Precompute the xor32 rolling checksum for finding the next cut mark
  // Note: this might be skipped, if the precomputation was already performed
  //       for the current rolling checksum
  //       (internal_precompute_end will be negative --> loop is not entered)
  const uint64_t precompute_end = last_cut() + minimal_chunk_size_;
  const int64_t internal_precompute_end =
      std::min(static_cast<int64_t>(precompute_end - offset()),
               static_cast<int64_t>(buffer->size()));
  // At most one rolling window of bytes needs precomputing
  assert(internal_precompute_end - static_cast<int64_t>(internal_offset) <=
         static_cast<int64_t>(kXor32Window));
  for (; static_cast<int64_t>(internal_offset) < internal_precompute_end;
       ++internal_offset)
  {
    xor32(data[internal_offset]);
  }

  // Do the actual computation and try to find a xor32 based cut mark
  // Note: this loop is bound either by kMaxChunkSize or by the size of the
  //       current buffer, thus the computation would continue later
  const uint64_t internal_max_chunk_size_end =
      last_cut() + maximal_chunk_size_ - offset();
  const uint64_t internal_compute_end =
      std::min(internal_max_chunk_size_end,
               static_cast<uint64_t>(buffer->size()));
  for (; internal_offset < internal_compute_end; ++internal_offset) {
    xor32(data[internal_offset]);

    // check if we found a cut mark
    if (CheckThreshold()) {
      return DoCut(internal_offset + offset());
    }
  }

  // Check if the loop was exited because we reached kMaxChunkSize and do a
  // hard cut in this case. If not, it exited because we ran out of data in
  // this buffer --> continue computation with the next buffer
  if (internal_offset == internal_max_chunk_size_end) {
    return DoCut(internal_offset + offset());
  } else {
    return NoCut(internal_offset + offset());
  }
}
138 |