Line |
Branch |
Exec |
Source |
1 |
|
|
/** |
2 |
|
|
* This file is part of the CernVM File System. |
3 |
|
|
*/ |
4 |
|
|
|
5 |
|
|
#ifndef CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
6 |
|
|
#define CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
7 |
|
|
|
8 |
|
|
#include <gtest/gtest_prod.h> |
9 |
|
|
#include <stdint.h> |
10 |
|
|
|
11 |
|
|
#include <cstdlib> |
12 |
|
|
|
13 |
|
|
#include <algorithm> |
14 |
|
|
|
15 |
|
|
class BlockItem; |
16 |
|
|
|
17 |
|
|
/** |
18 |
|
|
* Abstract base class for a cutmark detector. This decides on which file |
19 |
|
|
* positions a File should be chunked. |
20 |
|
|
*/ |
21 |
|
|
class ChunkDetector { |
22 |
|
|
public: |
23 |
|
253419 |
ChunkDetector() : last_cut_(0), offset_(0) {} |
24 |
|
506606 |
virtual ~ChunkDetector() { } |
25 |
|
|
uint64_t FindNextCutMark(BlockItem *block); |
26 |
|
|
|
27 |
|
|
virtual bool MightFindChunks(uint64_t size) const = 0; |
28 |
|
|
|
29 |
|
|
protected: |
30 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *block) = 0; |
31 |
|
|
|
32 |
|
|
/** |
33 |
|
|
* When returning from an implemented FindNextCutMark call you must call this |
34 |
|
|
* function when a cut mark has been found. |
35 |
|
|
* Like: return DoCut(found_offset) |
36 |
|
|
*/ |
37 |
|
108232 |
virtual uint64_t DoCut(uint64_t offset) { |
38 |
|
108232 |
last_cut_ = offset; |
39 |
|
108232 |
return offset; |
40 |
|
|
} |
41 |
|
|
|
42 |
|
|
/** |
43 |
|
|
* Same as DoCut() but if no cut mark has been found in the given buffer in |
44 |
|
|
* FindNextCutMark() |
45 |
|
|
*/ |
46 |
|
39055 |
virtual uint64_t NoCut(uint64_t /* offset */) { return 0; } |
47 |
|
|
|
48 |
|
199755 |
uint64_t last_cut() const { return last_cut_; } |
49 |
|
354728 |
uint64_t offset() const { return offset_; } |
50 |
|
|
|
51 |
|
|
private: |
52 |
|
|
uint64_t last_cut_; |
53 |
|
|
uint64_t offset_; |
54 |
|
|
}; |
55 |
|
|
|
56 |
|
|
|
57 |
|
|
/** |
58 |
|
|
* The StaticOffsetDetector cuts files on a hard threshold and generates |
59 |
|
|
* uniform sized Chunks. |
60 |
|
|
*/ |
61 |
|
|
class StaticOffsetDetector : public ChunkDetector { |
62 |
|
|
public: |
63 |
|
1 |
explicit StaticOffsetDetector(uint64_t s) : chunk_size_(s) { } |
64 |
|
2 |
bool MightFindChunks(uint64_t size) const { return size > chunk_size_; } |
65 |
|
|
|
66 |
|
|
protected: |
67 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *buffer); |
68 |
|
|
|
69 |
|
|
private: |
70 |
|
|
const uint64_t chunk_size_; |
71 |
|
|
}; |
72 |
|
|
|
73 |
|
|
|
74 |
|
|
/** |
75 |
|
|
* The xor32 rolling used in Igor-FS [1]. |
76 |
|
|
* |
77 |
|
|
* It takes a byte stream and constantly computes a 32-bit checksum in a way |
78 |
|
|
* that the result is only dependent on the last read 32 bytes of the stream. |
79 |
|
|
* Given random input data, the checksum eventually produces each number in the |
80 |
|
|
* 32-bit value range with roughly the same probability. |
81 |
|
|
* We exploit that by constantly checking the rolling checksum result for a |
82 |
|
|
* specific interval. Thus, we detect characteristic 32-byte long patches in a |
83 |
|
|
* file that do not depend on their actual position inside the data stream. |
84 |
|
|
* Thereby local modifications of a file might not affect global chunk creation. |
85 |
|
|
* |
86 |
|
|
* [1] "The Decentralized File System Igor-FS |
87 |
|
|
* as an Application for Overlay-Networks" |
88 |
|
|
* Dissertation of Dipl.-Ing. Kendy Kutzner (2008) |
89 |
|
|
*/ |
90 |
|
|
class Xor32Detector : public ChunkDetector { |
91 |
|
|
FRIEND_TEST(T_ChunkDetectors, Xor32); |
92 |
|
|
|
93 |
|
|
public: |
94 |
|
|
Xor32Detector(const uint64_t minimal_chunk_size, |
95 |
|
|
const uint64_t average_chunk_size, |
96 |
|
|
const uint64_t maximal_chunk_size); |
97 |
|
|
|
98 |
|
249849 |
bool MightFindChunks(const uint64_t size) const { |
99 |
|
249849 |
return size > minimal_chunk_size_; |
100 |
|
|
} |
101 |
|
|
|
102 |
|
|
protected: |
103 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *buffer); |
104 |
|
|
|
105 |
|
5831 |
virtual uint64_t DoCut(const uint64_t offset) { |
106 |
|
5831 |
xor32_ = 0; |
107 |
|
5831 |
xor32_ptr_ = offset; |
108 |
|
5831 |
return ChunkDetector::DoCut(offset); |
109 |
|
|
} |
110 |
|
|
|
111 |
|
38953 |
virtual uint64_t NoCut(const uint64_t offset) { |
112 |
|
38953 |
xor32_ptr_ = offset; |
113 |
|
38953 |
return ChunkDetector::NoCut(offset); |
114 |
|
|
} |
115 |
|
|
|
116 |
|
492914805 |
inline void xor32(const unsigned char byte) { |
117 |
|
492914805 |
xor32_ = (xor32_ << 1) ^ byte; |
118 |
|
492914805 |
} |
119 |
|
|
|
120 |
|
492675366 |
inline bool CheckThreshold() { |
121 |
|
492675366 |
return abs(static_cast<int32_t>(xor32_) - kMagicNumber) < threshold_; |
122 |
|
|
} |
123 |
|
|
|
124 |
|
|
private: |
125 |
|
|
// xor32 only depends on a window of the last 32 bytes in the data stream |
126 |
|
|
static const unsigned kXor32Window = 32; |
127 |
|
|
static const int32_t kMagicNumber; |
128 |
|
|
|
129 |
|
|
const uint64_t minimal_chunk_size_; |
130 |
|
|
const uint64_t average_chunk_size_; |
131 |
|
|
const uint64_t maximal_chunk_size_; |
132 |
|
|
const int32_t threshold_; |
133 |
|
|
|
134 |
|
|
uint64_t xor32_ptr_; |
135 |
|
|
uint32_t xor32_; |
136 |
|
|
}; |
137 |
|
|
|
138 |
|
|
#endif // CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
139 |
|
|
|