Directory: | cvmfs/ |
---|---|
File: | cvmfs/ingestion/chunk_detector.cc |
Date: | 2025-06-22 02:36:02 |
Exec | Total | Coverage | |
---|---|---|---|
Lines: | 56 | 56 | 100.0% |
Branches: | 33 | 46 | 71.7% |
Line | Branch | Exec | Source |
---|---|---|---|
1 | /** | ||
2 | * This file is part of the CernVM File System. | ||
3 | */ | ||
4 | |||
5 | |||
6 | #include "chunk_detector.h" | ||
7 | |||
8 | #include <algorithm> | ||
9 | #include <cassert> | ||
10 | #include <limits> | ||
11 | |||
12 | #include "ingestion/item.h" | ||
13 | |||
14 | |||
15 | 2327677 | uint64_t ChunkDetector::FindNextCutMark(BlockItem *block) { | |
16 | 2327677 | const uint64_t result = DoFindNextCutMark(block); | |
17 |
2/2✓ Branch 0 taken 200604 times.
✓ Branch 1 taken 2127071 times.
|
2327675 | if (result == 0) |
18 | 200604 | offset_ += block->size(); | |
19 | 2327675 | return result; | |
20 | } | ||
21 | |||
22 | |||
23 | //------------------------------------------------------------------------------ | ||
24 | |||
25 | |||
26 | 2050080 | uint64_t StaticOffsetDetector::DoFindNextCutMark(BlockItem *buffer) { | |
27 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 2050080 times.
|
2050080 | assert(buffer->type() == BlockItem::kBlockData); |
28 | |||
29 | 2050080 | const uint64_t beginning = offset(); | |
30 | 2050080 | const uint64_t end = offset() + buffer->size(); | |
31 | |||
32 | 2050080 | const uint64_t next_cut = last_cut() + chunk_size_; | |
33 |
3/4✓ Branch 0 taken 2050080 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 2048020 times.
✓ Branch 3 taken 2060 times.
|
2050080 | if (next_cut >= beginning && next_cut < end) { |
34 | 2048020 | return DoCut(next_cut); | |
35 | } | ||
36 | |||
37 | 2060 | return NoCut(next_cut); | |
38 | } | ||
39 | |||
40 | |||
41 | //------------------------------------------------------------------------------ | ||
42 | |||
43 | |||
44 | // This defines the center of the interval where the xor32 rolling checksum is | ||
45 | // queried. You should never change this number, since it affects the definition | ||
46 | // of cut marks. | ||
47 | const int32_t Xor32Detector::kMagicNumber = std::numeric_limits<uint32_t>::max() | ||
48 | / 2; | ||
49 | |||
50 | |||
51 | 332567 | Xor32Detector::Xor32Detector(const uint64_t minimal_chunk_size, | |
52 | const uint64_t average_chunk_size, | ||
53 | 332567 | const uint64_t maximal_chunk_size) | |
54 | 332567 | : minimal_chunk_size_(minimal_chunk_size) | |
55 | 332567 | , average_chunk_size_(average_chunk_size) | |
56 | 332567 | , maximal_chunk_size_(maximal_chunk_size) | |
57 | 14 | , threshold_( | |
58 | (average_chunk_size > 0) | ||
59 | 332553 | ? (std::numeric_limits<uint32_t>::max() / average_chunk_size) | |
60 | : 0) | ||
61 | 332567 | , xor32_ptr_(0) | |
62 |
2/2✓ Branch 1 taken 332553 times.
✓ Branch 2 taken 14 times.
|
332567 | , xor32_(0) { |
63 |
3/4✓ Branch 0 taken 332553 times.
✓ Branch 1 taken 14 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 332553 times.
|
332567 | assert((average_chunk_size_ == 0) || (minimal_chunk_size_ > 0)); |
64 |
2/2✓ Branch 0 taken 332553 times.
✓ Branch 1 taken 14 times.
|
332567 | if (minimal_chunk_size_ > 0) { |
65 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 332553 times.
|
332553 | assert(minimal_chunk_size_ >= kXor32Window); |
66 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 332553 times.
|
332553 | assert(minimal_chunk_size_ < average_chunk_size_); |
67 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 332553 times.
|
332553 | assert(average_chunk_size_ < maximal_chunk_size_); |
68 | } | ||
69 | 332567 | } | |
70 | |||
71 | |||
72 | 277597 | uint64_t Xor32Detector::DoFindNextCutMark(BlockItem *buffer) { | |
73 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 277597 times.
|
277597 | assert(minimal_chunk_size_ > 0); |
74 | 277597 | const unsigned char *data = buffer->data(); | |
75 | |||
76 | // Get the offset where the next xor32 computation needs to be continued | ||
77 | // Note: this could be after collecting at least kMinChunkSize bytes in the | ||
78 | // current chunk, or directly at the beginning of the buffer, when a | ||
79 | // cut mark is currently searched | ||
80 | 277589 | const uint64_t global_offset = std::max( | |
81 | 277589 | last_cut() + static_cast<uint64_t>(minimal_chunk_size_ - kXor32Window), | |
82 | 277592 | xor32_ptr_); | |
83 | |||
84 | // Check if the next xor32 computation is taking place in the current buffer | ||
85 |
2/2✓ Branch 2 taken 97821 times.
✓ Branch 3 taken 179782 times.
|
277589 | if (global_offset >= offset() + static_cast<uint64_t>(buffer->size())) { |
86 |
1/2✓ Branch 1 taken 97818 times.
✗ Branch 2 not taken.
|
97821 | return NoCut(global_offset); |
87 | } | ||
88 | |||
89 | // get the byte offset in the current buffer | ||
90 | 179782 | uint64_t internal_offset = global_offset - offset(); | |
91 |
1/2✗ Branch 1 not taken.
✓ Branch 2 taken 179782 times.
|
179782 | assert(internal_offset < static_cast<uint64_t>(buffer->size())); |
92 | |||
93 | // Precompute the xor32 rolling checksum for finding the next cut mark | ||
94 | // Note: this might be skipped, if the precomputation was already performed | ||
95 | // for the current rolling checksum | ||
96 | // (internal_precompute_end will be negative --> loop is not entered) | ||
97 | 179782 | const uint64_t precompute_end = last_cut() + minimal_chunk_size_; | |
98 | 179782 | const int64_t internal_precompute_end = std::min( | |
99 | 179782 | static_cast<int64_t>(precompute_end - offset()), | |
100 | 179782 | static_cast<int64_t>(buffer->size())); | |
101 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 179782 times.
|
179782 | assert(internal_precompute_end - static_cast<int64_t>(internal_offset) |
102 | <= static_cast<int64_t>(kXor32Window)); | ||
103 |
2/2✓ Branch 0 taken 2532540 times.
✓ Branch 1 taken 179782 times.
|
2712322 | for (; static_cast<int64_t>(internal_offset) < internal_precompute_end; |
104 | ++internal_offset) { | ||
105 | 2532540 | xor32(data[internal_offset]); | |
106 | } | ||
107 | |||
108 | // Do the actual computation and try to find a xor32 based cut mark | ||
109 | // Note: this loop is bound either by kMaxChunkSize or by the size of the | ||
110 | // current buffer, thus the computation would continue later | ||
111 | 179782 | const uint64_t internal_max_chunk_size_end = last_cut() + maximal_chunk_size_ | |
112 | 179782 | - offset(); | |
113 | 179780 | const uint64_t internal_compute_end = std::min( | |
114 | 179781 | internal_max_chunk_size_end, static_cast<uint64_t>(buffer->size())); | |
115 |
2/2✓ Branch 0 taken 5877944684 times.
✓ Branch 1 taken 101387 times.
|
5878046071 | for (; internal_offset < internal_compute_end; ++internal_offset) { |
116 | 5877944684 | xor32(data[internal_offset]); | |
117 | |||
118 | // check if we found a cut mark | ||
119 |
2/2✓ Branch 1 taken 78396 times.
✓ Branch 2 taken 5876703287 times.
|
5876600851 | if (CheckThreshold()) { |
120 |
1/2✓ Branch 2 taken 78396 times.
✗ Branch 3 not taken.
|
78396 | return DoCut(internal_offset + offset()); |
121 | } | ||
122 | } | ||
123 | |||
124 | // Check if the loop was exited because we reached kMaxChunkSize and do a | ||
125 | // hard cut in this case. If not, it exited because we ran out of data in this | ||
126 | // buffer --> continue computation with the next buffer | ||
127 |
2/2✓ Branch 0 taken 656 times.
✓ Branch 1 taken 100731 times.
|
101387 | if (internal_offset == internal_max_chunk_size_end) { |
128 |
1/2✓ Branch 2 taken 656 times.
✗ Branch 3 not taken.
|
656 | return DoCut(internal_offset + offset()); |
129 | } else { | ||
130 |
1/2✓ Branch 2 taken 100730 times.
✗ Branch 3 not taken.
|
100731 | return NoCut(internal_offset + offset()); |
131 | } | ||
132 | } | ||
133 |