CernVM-FS  2.12.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
chunk_detector.cc
Go to the documentation of this file.
1 
5 #include "cvmfs_config.h"
6 #include "chunk_detector.h"
7 
8 #include <algorithm>
9 #include <cassert>
10 #include <limits>
11 
12 #include "ingestion/item.h"
13 
14 
16  uint64_t result = DoFindNextCutMark(block);
17  if (result == 0)
18  offset_ += block->size();
19  return result;
20 }
21 
22 
23 //------------------------------------------------------------------------------
24 
25 
27  assert(buffer->type() == BlockItem::kBlockData);
28 
29  const uint64_t beginning = offset();
30  const uint64_t end = offset() + buffer->size();
31 
32  const uint64_t next_cut = last_cut() + chunk_size_;
33  if (next_cut >= beginning && next_cut < end) {
34  return DoCut(next_cut);
35  }
36 
37  return NoCut(next_cut);
38 }
39 
40 
41 //------------------------------------------------------------------------------
42 
43 
44 
45 // This defines the center of the interval where the xor32 rolling checksum is
46 // queried. You should never change this number, since it affects the definition
47 // of cut marks. It evaluates to 0x7FFFFFFF (half of the uint32_t maximum).
48 const int32_t Xor32Detector::kMagicNumber =
49  std::numeric_limits<uint32_t>::max() / 2;
50 
51 
// Constructs the detector from the minimal, average and maximal chunk size.
// The three sizes are stored unchanged in the corresponding members and the
// rolling checksum state (xor32_ptr_, xor32_) starts at zero.
52 Xor32Detector::Xor32Detector(const uint64_t minimal_chunk_size,
53  const uint64_t average_chunk_size,
54  const uint64_t maximal_chunk_size)
55  : minimal_chunk_size_(minimal_chunk_size)
56  , average_chunk_size_(average_chunk_size)
57  , maximal_chunk_size_(maximal_chunk_size)
// threshold_ is the uint32_t maximum divided by the average chunk size; an
// average chunk size of 0 results in a threshold of 0 instead of a division
// by zero.
58  , threshold_((average_chunk_size > 0)
59  ? (std::numeric_limits<uint32_t>::max() / average_chunk_size)
60  : 0)
61  , xor32_ptr_(0)
62  , xor32_(0)
63 {
// NOTE(review): source lines 64 and 66-68 are not part of this listing, so the
// statements guarded by the check below are not visible here.
65  if (minimal_chunk_size_ > 0) {
69  }
70 }
71 
72 
75  const unsigned char *data = buffer->data();
76 
77  // Get the offset where the next xor32 computation needs to be continued
78  // Note: this is either kXor32Window bytes before the point at which the
79  // current chunk reaches minimal_chunk_size_ bytes, or xor32_ptr_ if that
80  // is already further ahead
81  const uint64_t global_offset =
82  std::max(
83  last_cut() +
84  static_cast<uint64_t>(minimal_chunk_size_ - kXor32Window),
85  xor32_ptr_);
86 
87  // Check if the next xor32 computation is taking place in the current buffer
88  if (global_offset >= offset() + static_cast<uint64_t>(buffer->size())) {
89  return NoCut(global_offset);
90  }
91 
92  // get the byte offset in the current buffer
93  uint64_t internal_offset = global_offset - offset();
94  assert(internal_offset < static_cast<uint64_t>(buffer->size()));
95 
96  // Precompute the xor32 rolling checksum for finding the next cut mark
97  // Note: this might be skipped, if the precomputation was already performed
98  // for the current rolling checksum
99  // (internal_precompute_end <= internal_offset then --> loop is not entered)
100  const uint64_t precompute_end = last_cut() + minimal_chunk_size_;
101  const int64_t internal_precompute_end =
102  std::min(static_cast<int64_t>(precompute_end - offset()),
103  static_cast<int64_t>(buffer->size()));
104  assert(internal_precompute_end - static_cast<int64_t>(internal_offset) <=
105  static_cast<int64_t>(kXor32Window));
106  for (; static_cast<int64_t>(internal_offset) < internal_precompute_end;
107  ++internal_offset)
108  {
109  xor32(data[internal_offset]);
110  }
111 
112  // Do the actual computation and try to find a xor32 based cut mark
113  // Note: this loop is bound either by maximal_chunk_size_ or by the size of
114  // the current buffer; in the latter case it is continued in a later call
115  const uint64_t internal_max_chunk_size_end =
117  const uint64_t internal_compute_end =
118  std::min(internal_max_chunk_size_end,
119  static_cast<uint64_t>(buffer->size()));
120  for (; internal_offset < internal_compute_end; ++internal_offset) {
121  xor32(data[internal_offset]);
122 
123  // check if we found a cut mark
124  if (CheckThreshold()) {
125  return DoCut(internal_offset + offset());
126  }
127  }
128 
129  // Check if the loop was exited because we reached maximal_chunk_size_ and do
130  // a hard cut in this case. If not, it exited because we ran out of data in
131  // this buffer --> continue computation with the next buffer
132  if (internal_offset == internal_max_chunk_size_end) {
133  return DoCut(internal_offset + offset());
134  } else {
135  return NoCut(internal_offset + offset());
136  }
137 }
virtual uint64_t DoFindNextCutMark(BlockItem *buffer)
void xor32(const unsigned char byte)
const uint64_t chunk_size_
virtual uint64_t DoCut(const uint64_t offset)
const uint64_t maximal_chunk_size_
uint64_t offset_
static const int32_t kMagicNumber
virtual uint64_t NoCut(const uint64_t offset)
assert((mem||(size==0))&&"Out Of Memory")
unsigned char * data()
Definition: item.h:213
static const unsigned kXor32Window
virtual uint64_t DoFindNextCutMark(BlockItem *buffer)
uint64_t xor32_ptr_
const uint64_t minimal_chunk_size_
uint32_t size()
Definition: item.h:215
uint64_t FindNextCutMark(BlockItem *block)
virtual uint64_t NoCut(uint64_t)
Xor32Detector(const uint64_t minimal_chunk_size, const uint64_t average_chunk_size, const uint64_t maximal_chunk_size)
uint64_t offset() const
BlockType type()
Definition: item.h:218
virtual uint64_t DoCut(uint64_t offset)
bool CheckThreshold()
uint64_t last_cut() const
const uint64_t average_chunk_size_
virtual uint64_t DoFindNextCutMark(BlockItem *block)=0