1 |
|
|
/** |
2 |
|
|
* This file is part of the CernVM File System. |
3 |
|
|
*/ |
4 |
|
|
|
5 |
|
|
#include "cvmfs_config.h" |
6 |
|
|
#include "chunk_detector.h" |
7 |
|
|
|
8 |
|
|
#include <algorithm> |
9 |
|
|
#include <cassert> |
10 |
|
|
#include <limits> |
11 |
|
|
|
12 |
|
|
#include "ingestion/item.h" |
13 |
|
|
|
14 |
|
|
|
15 |
|
230660 |
uint64_t ChunkDetector::FindNextCutMark(BlockItem *block) { |
16 |
|
230660 |
uint64_t result = DoFindNextCutMark(block); |
17 |
✓✓ |
230660 |
if (result == 0) |
18 |
|
121234 |
offset_ += block->size(); |
19 |
|
230660 |
return result; |
20 |
|
|
} |
21 |
|
|
|
22 |
|
|
|
23 |
|
|
//------------------------------------------------------------------------------ |
24 |
|
|
|
25 |
|
|
|
26 |
|
102504 |
uint64_t StaticOffsetDetector::DoFindNextCutMark(BlockItem *buffer) { |
27 |
✗✓ |
102504 |
assert(buffer->type() == BlockItem::kBlockData); |
28 |
|
|
|
29 |
|
102504 |
const uint64_t beginning = offset(); |
30 |
|
102504 |
const uint64_t end = offset() + buffer->size(); |
31 |
|
|
|
32 |
|
102504 |
const uint64_t next_cut = last_cut() + chunk_size_; |
33 |
✓✗✓✓
|
102504 |
if (next_cut >= beginning && next_cut < end) { |
34 |
|
102401 |
return DoCut(next_cut); |
35 |
|
|
} |
36 |
|
|
|
37 |
|
103 |
return NoCut(next_cut); |
38 |
|
|
} |
39 |
|
|
|
40 |
|
|
|
41 |
|
|
//------------------------------------------------------------------------------ |
42 |
|
|
|
43 |
|
|
|
44 |
|
|
|
45 |
|
|
// This defines the center of the interval where the xor32 rolling checksum is |
46 |
|
|
// queried. You should never change this number, since it affects the definition |
47 |
|
|
// of cut marks. |
48 |
|
15 |
const int32_t Xor32Detector::kMagicNumber = |
49 |
|
|
std::numeric_limits<uint32_t>::max() / 2; |
50 |
|
|
|
51 |
|
|
|
52 |
|
965 |
Xor32Detector::Xor32Detector(const uint64_t minimal_chunk_size, |
53 |
|
|
const uint64_t average_chunk_size, |
54 |
|
|
const uint64_t maximal_chunk_size) |
55 |
|
|
: minimal_chunk_size_(minimal_chunk_size) |
56 |
|
|
, average_chunk_size_(average_chunk_size) |
57 |
|
|
, maximal_chunk_size_(maximal_chunk_size) |
58 |
|
|
, threshold_((average_chunk_size > 0) |
59 |
|
|
? (std::numeric_limits<uint32_t>::max() / average_chunk_size) |
60 |
|
|
: 0) |
61 |
|
|
, xor32_ptr_(0) |
62 |
✓✓ |
965 |
, xor32_(0) |
63 |
|
|
{ |
64 |
✓✓✗✓
|
965 |
assert((average_chunk_size_ == 0) || (minimal_chunk_size_ > 0)); |
65 |
✓✓ |
965 |
if (minimal_chunk_size_ > 0) { |
66 |
✗✓ |
964 |
assert(minimal_chunk_size_ >= kXor32Window); |
67 |
✗✓ |
964 |
assert(minimal_chunk_size_ < average_chunk_size_); |
68 |
✗✓ |
964 |
assert(average_chunk_size_ < maximal_chunk_size_); |
69 |
|
|
} |
70 |
|
|
} |
71 |
|
|
|
72 |
|
|
|
73 |
|
128156 |
uint64_t Xor32Detector::DoFindNextCutMark(BlockItem *buffer) { |
74 |
✗✓ |
128156 |
assert(minimal_chunk_size_ > 0); |
75 |
|
128156 |
const unsigned char *data = buffer->data(); |
76 |
|
|
|
77 |
|
|
// Get the offset where the next xor32 computation needs to be continued |
78 |
|
|
// Note: this could be after collecting at least kMinChunkSize bytes in the |
79 |
|
|
// current chunk, or directly at the beginning of the buffer, when a |
80 |
|
|
// cut mark is currently searched |
81 |
|
|
const uint64_t global_offset = |
82 |
|
|
std::max( |
83 |
|
|
last_cut() + |
84 |
|
|
static_cast<uint64_t>(minimal_chunk_size_ - kXor32Window), |
85 |
|
128156 |
xor32_ptr_); |
86 |
|
|
|
87 |
|
|
// Check if the next xor32 computation is taking place in the current buffer |
88 |
✓✓ |
128156 |
if (global_offset >= offset() + static_cast<uint64_t>(buffer->size())) { |
89 |
|
56905 |
return NoCut(global_offset); |
90 |
|
|
} |
91 |
|
|
|
92 |
|
|
// get the byte offset in the current buffer |
93 |
|
71251 |
uint64_t internal_offset = global_offset - offset(); |
94 |
✗✓ |
71251 |
assert(internal_offset < static_cast<uint64_t>(buffer->size())); |
95 |
|
|
|
96 |
|
|
// Precompute the xor32 rolling checksum for finding the next cut mark |
97 |
|
|
// Note: this might be skipped, if the precomputation was already performed |
98 |
|
|
// for the current rolling checksum |
99 |
|
|
// (internal_precompute_end will be negative --> loop is not entered) |
100 |
|
71251 |
const uint64_t precompute_end = last_cut() + minimal_chunk_size_; |
101 |
|
|
const int64_t internal_precompute_end = |
102 |
|
|
std::min(static_cast<int64_t>(precompute_end - offset()), |
103 |
|
71251 |
static_cast<int64_t>(buffer->size())); |
104 |
|
|
assert(internal_precompute_end - static_cast<int64_t>(internal_offset) <= |
105 |
✗✓ |
71251 |
static_cast<int64_t>(kXor32Window)); |
106 |
✓✓ |
297333 |
for (; static_cast<int64_t>(internal_offset) < internal_precompute_end; |
107 |
|
|
++internal_offset) |
108 |
|
|
{ |
109 |
|
226082 |
xor32(data[internal_offset]); |
110 |
|
|
} |
111 |
|
|
|
112 |
|
|
// Do the actual computation and try to find a xor32 based cut mark |
113 |
|
|
// Note: this loop is bound either by kMaxChunkSize or by the size of the |
114 |
|
|
// current buffer, thus the computation would continue later |
115 |
|
|
const uint64_t internal_max_chunk_size_end = |
116 |
|
71251 |
last_cut() + maximal_chunk_size_ - offset(); |
117 |
|
|
const uint64_t internal_compute_end = |
118 |
|
|
std::min(internal_max_chunk_size_end, |
119 |
|
71251 |
static_cast<uint64_t>(buffer->size())); |
120 |
✓✓ |
1242705521 |
for (; internal_offset < internal_compute_end; ++internal_offset) { |
121 |
|
1242640993 |
xor32(data[internal_offset]); |
122 |
|
|
|
123 |
|
|
// check if we found a cut mark |
124 |
✓✓ |
1242640993 |
if (CheckThreshold()) { |
125 |
|
6723 |
return DoCut(internal_offset + offset()); |
126 |
|
|
} |
127 |
|
|
} |
128 |
|
|
|
129 |
|
|
// Check if the loop was exited because we reached kMaxChunkSize and do a |
130 |
|
|
// hard cut in this case. If not, it exited because we ran out of data in this |
131 |
|
|
// buffer --> continue computation with the next buffer |
132 |
✓✓ |
64528 |
if (internal_offset == internal_max_chunk_size_end) { |
133 |
|
302 |
return DoCut(internal_offset + offset()); |
134 |
|
|
} else { |
135 |
|
64226 |
return NoCut(internal_offset + offset()); |
136 |
|
|
} |
137 |
✓✗✓✗
|
45 |
} |