Line |
Branch |
Exec |
Source |
1 |
|
|
/** |
2 |
|
|
* This file is part of the CernVM File System. |
3 |
|
|
*/ |
4 |
|
|
|
5 |
|
|
#ifndef CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
6 |
|
|
#define CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
7 |
|
|
|
8 |
|
|
#include <gtest/gtest_prod.h> |
9 |
|
|
#include <stdint.h> |
10 |
|
|
|
11 |
|
|
#include <algorithm> |
12 |
|
|
#include <cstdlib> |
13 |
|
|
|
14 |
|
|
class BlockItem; |
15 |
|
|
|
16 |
|
|
/** |
17 |
|
|
* Abstract base class for a cutmark detector. This decides on which file |
18 |
|
|
* positions a File should be chunked. |
19 |
|
|
*/ |
20 |
|
|
class ChunkDetector { |
21 |
|
|
public: |
22 |
|
332587 |
ChunkDetector() : last_cut_(0), offset_(0) { } |
23 |
|
663536 |
virtual ~ChunkDetector() { } |
24 |
|
|
uint64_t FindNextCutMark(BlockItem *block); |
25 |
|
|
|
26 |
|
|
virtual bool MightFindChunks(uint64_t size) const = 0; |
27 |
|
|
|
28 |
|
|
protected: |
29 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *block) = 0; |
30 |
|
|
|
31 |
|
|
/** |
32 |
|
|
* When returning from an implemented FindNextCutMark call you must call this |
33 |
|
|
* function when a cut mark has been found. |
34 |
|
|
* Like: return DoCut(found_offset) |
35 |
|
|
*/ |
36 |
|
2127072 |
virtual uint64_t DoCut(uint64_t offset) { |
37 |
|
2127072 |
last_cut_ = offset; |
38 |
|
2127072 |
return offset; |
39 |
|
|
} |
40 |
|
|
|
41 |
|
|
/** |
42 |
|
|
* Same as DoCut() but if no cut mark has been found in the given buffer in |
43 |
|
|
* FindNextCutMark() |
44 |
|
|
*/ |
45 |
|
200603 |
virtual uint64_t NoCut(uint64_t /* offset */) { return 0; } |
46 |
|
|
|
47 |
|
2687223 |
uint64_t last_cut() const { return last_cut_; } |
48 |
|
5096846 |
uint64_t offset() const { return offset_; } |
49 |
|
|
|
50 |
|
|
private: |
51 |
|
|
uint64_t last_cut_; |
52 |
|
|
uint64_t offset_; |
53 |
|
|
}; |
54 |
|
|
|
55 |
|
|
|
56 |
|
|
/** |
57 |
|
|
* The StaticOffsetDetector cuts files on a hard threshold and generates |
58 |
|
|
* uniform sized Chunks. |
59 |
|
|
*/ |
60 |
|
|
class StaticOffsetDetector : public ChunkDetector { |
61 |
|
|
public: |
62 |
|
20 |
explicit StaticOffsetDetector(uint64_t s) : chunk_size_(s) { } |
63 |
|
40 |
bool MightFindChunks(uint64_t size) const { return size > chunk_size_; } |
64 |
|
|
|
65 |
|
|
protected: |
66 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *buffer); |
67 |
|
|
|
68 |
|
|
private: |
69 |
|
|
const uint64_t chunk_size_; |
70 |
|
|
}; |
71 |
|
|
|
72 |
|
|
|
73 |
|
|
/** |
74 |
|
|
* The xor32 rolling used in Igor-FS [1]. |
75 |
|
|
* |
76 |
|
|
* It takes a byte stream and constantly computes a 32-bit checksum in a way |
77 |
|
|
* that the result is only dependent on the last read 32 bytes of the stream. |
78 |
|
|
* Given random input data, the checksum eventually produces each number in the |
79 |
|
|
* 32-bit value range with roughly the same probability. |
80 |
|
|
* We exploit that by constantly checking the rolling checksum result for a |
81 |
|
|
* specific interval. Thus, we detect characteristic 32-byte long patches in a |
82 |
|
|
* file that do not depend on their actual position inside the data stream. |
83 |
|
|
* Thereby local modifications of a file might not affect global chunk creation. |
84 |
|
|
* |
85 |
|
|
* [1] "The Decentralized File System Igor-FS |
86 |
|
|
* as an Application for Overlay-Networks" |
87 |
|
|
* Dissertation of Dipl.-Ing. Kendy Kutzner (2008) |
88 |
|
|
*/ |
89 |
|
|
class Xor32Detector : public ChunkDetector { |
90 |
|
|
FRIEND_TEST(T_ChunkDetectors, Xor32); |
91 |
|
|
|
92 |
|
|
public: |
93 |
|
|
Xor32Detector(const uint64_t minimal_chunk_size, |
94 |
|
|
const uint64_t average_chunk_size, |
95 |
|
|
const uint64_t maximal_chunk_size); |
96 |
|
|
|
97 |
|
250040 |
bool MightFindChunks(const uint64_t size) const { |
98 |
|
250040 |
return size > minimal_chunk_size_; |
99 |
|
|
} |
100 |
|
|
|
101 |
|
|
protected: |
102 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *buffer); |
103 |
|
|
|
104 |
|
79052 |
virtual uint64_t DoCut(const uint64_t offset) { |
105 |
|
79052 |
xor32_ = 0; |
106 |
|
79052 |
xor32_ptr_ = offset; |
107 |
|
79052 |
return ChunkDetector::DoCut(offset); |
108 |
|
|
} |
109 |
|
|
|
110 |
|
198547 |
virtual uint64_t NoCut(const uint64_t offset) { |
111 |
|
198547 |
xor32_ptr_ = offset; |
112 |
|
198547 |
return ChunkDetector::NoCut(offset); |
113 |
|
|
} |
114 |
|
|
|
115 |
|
5879434463 |
inline void xor32(const unsigned char byte) { xor32_ = (xor32_ << 1) ^ byte; } |
116 |
|
|
|
117 |
|
5876792600 |
inline bool CheckThreshold() { |
118 |
|
5876792600 |
return abs(static_cast<int32_t>(xor32_) - kMagicNumber) < threshold_; |
119 |
|
|
} |
120 |
|
|
|
121 |
|
|
private: |
122 |
|
|
// xor32 only depends on a window of the last 32 bytes in the data stream |
123 |
|
|
static const unsigned kXor32Window = 32; |
124 |
|
|
static const int32_t kMagicNumber; |
125 |
|
|
|
126 |
|
|
const uint64_t minimal_chunk_size_; |
127 |
|
|
const uint64_t average_chunk_size_; |
128 |
|
|
const uint64_t maximal_chunk_size_; |
129 |
|
|
const int32_t threshold_; |
130 |
|
|
|
131 |
|
|
uint64_t xor32_ptr_; |
132 |
|
|
uint32_t xor32_; |
133 |
|
|
}; |
134 |
|
|
|
135 |
|
|
#endif // CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
136 |
|
|
|