| Line |
Branch |
Exec |
Source |
| 1 |
|
|
/** |
| 2 |
|
|
* This file is part of the CernVM File System. |
| 3 |
|
|
*/ |
| 4 |
|
|
|
| 5 |
|
|
#ifndef CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
| 6 |
|
|
#define CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
| 7 |
|
|
|
| 8 |
|
|
#include "duplex_testing.h" |
| 9 |
|
|
#include <stdint.h> |
| 10 |
|
|
|
| 11 |
|
|
#include <algorithm> |
| 12 |
|
|
#include <cstdlib> |
| 13 |
|
|
|
| 14 |
|
|
class BlockItem; |
| 15 |
|
|
|
| 16 |
|
|
/** |
| 17 |
|
|
* Abstract base class for a cutmark detector. This decides on which file |
| 18 |
|
|
* positions a File should be chunked. |
| 19 |
|
|
*/ |
| 20 |
|
|
class ChunkDetector { |
| 21 |
|
|
public: |
| 22 |
|
4577566 |
ChunkDetector() : last_cut_(0), offset_(0) { } |
| 23 |
|
9149382 |
virtual ~ChunkDetector() { } |
| 24 |
|
|
uint64_t FindNextCutMark(BlockItem *block); |
| 25 |
|
|
|
| 26 |
|
|
virtual bool MightFindChunks(uint64_t size) const = 0; |
| 27 |
|
|
|
| 28 |
|
|
protected: |
| 29 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *block) = 0; |
| 30 |
|
|
|
| 31 |
|
|
/** |
| 32 |
|
|
* When returning from an implemented FindNextCutMark call you must call this |
| 33 |
|
|
* function when a cut mark has been found. |
| 34 |
|
|
* Like: return DoCut(found_offset) |
| 35 |
|
|
*/ |
| 36 |
|
5146001 |
virtual uint64_t DoCut(uint64_t offset) { |
| 37 |
|
5146001 |
last_cut_ = offset; |
| 38 |
|
5146001 |
return offset; |
| 39 |
|
|
} |
| 40 |
|
|
|
| 41 |
|
|
/** |
| 42 |
|
|
* Same as DoCut() but if no cut mark has been found in the given buffer in |
| 43 |
|
|
* FindNextCutMark() |
| 44 |
|
|
*/ |
| 45 |
|
774028 |
virtual uint64_t NoCut(uint64_t /* offset */) { return 0; } |
| 46 |
|
|
|
| 47 |
|
6980909 |
uint64_t last_cut() const { return last_cut_; } |
| 48 |
|
13064539 |
uint64_t offset() const { return offset_; } |
| 49 |
|
|
|
| 50 |
|
|
private: |
| 51 |
|
|
uint64_t last_cut_; |
| 52 |
|
|
uint64_t offset_; |
| 53 |
|
|
}; |
| 54 |
|
|
|
| 55 |
|
|
|
| 56 |
|
|
/** |
| 57 |
|
|
* The StaticOffsetDetector cuts files on a hard threshold and generates |
| 58 |
|
|
* uniform sized Chunks. |
| 59 |
|
|
*/ |
| 60 |
|
|
class StaticOffsetDetector : public ChunkDetector { |
| 61 |
|
|
public: |
| 62 |
|
49 |
explicit StaticOffsetDetector(uint64_t s) : chunk_size_(s) { } |
| 63 |
|
98 |
bool MightFindChunks(uint64_t size) const { return size > chunk_size_; } |
| 64 |
|
|
|
| 65 |
|
|
protected: |
| 66 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *buffer); |
| 67 |
|
|
|
| 68 |
|
|
private: |
| 69 |
|
|
const uint64_t chunk_size_; |
| 70 |
|
|
}; |
| 71 |
|
|
|
| 72 |
|
|
|
| 73 |
|
|
/** |
| 74 |
|
|
* The xor32 rolling used in Igor-FS [1]. |
| 75 |
|
|
* |
| 76 |
|
|
* It takes a byte stream and constantly computes a 32-bit checksum in a way |
| 77 |
|
|
* that the result is only dependent on the last read 32 bytes of the stream. |
| 78 |
|
|
* Given random input data, the checksum eventually produces each number in the |
| 79 |
|
|
* 32-bit value range with roughly the same probability. |
| 80 |
|
|
* We exploit that by constantly checking the rolling checksum result for a |
| 81 |
|
|
* specific interval. Thus, we detect characteristic 32-byte long patches in a |
| 82 |
|
|
* file that do not depend on their actual position inside the data stream. |
| 83 |
|
|
* Thereby local modifications of a file might not affect global chunk creation. |
| 84 |
|
|
* |
| 85 |
|
|
* [1] "The Decentralized File System Igor-FS |
| 86 |
|
|
* as an Application for Overlay-Networks" |
| 87 |
|
|
* Dissertation of Dipl.-Ing. Kendy Kutzner (2008) |
| 88 |
|
|
*/ |
| 89 |
|
|
class Xor32Detector : public ChunkDetector { |
| 90 |
|
|
FRIEND_TEST(T_ChunkDetectors, Xor32); |
| 91 |
|
|
|
| 92 |
|
|
public: |
| 93 |
|
|
Xor32Detector(const uint64_t minimal_chunk_size, |
| 94 |
|
|
const uint64_t average_chunk_size, |
| 95 |
|
|
const uint64_t maximal_chunk_size); |
| 96 |
|
|
|
| 97 |
|
4498670 |
bool MightFindChunks(const uint64_t size) const { |
| 98 |
|
4498670 |
return size > minimal_chunk_size_; |
| 99 |
|
|
} |
| 100 |
|
|
|
| 101 |
|
|
protected: |
| 102 |
|
|
virtual uint64_t DoFindNextCutMark(BlockItem *buffer); |
| 103 |
|
|
|
| 104 |
|
128352 |
virtual uint64_t DoCut(const uint64_t offset) { |
| 105 |
|
128352 |
xor32_ = 0; |
| 106 |
|
128352 |
xor32_ptr_ = offset; |
| 107 |
|
128352 |
return ChunkDetector::DoCut(offset); |
| 108 |
|
|
} |
| 109 |
|
|
|
| 110 |
|
769053 |
virtual uint64_t NoCut(const uint64_t offset) { |
| 111 |
|
769053 |
xor32_ptr_ = offset; |
| 112 |
|
769053 |
return ChunkDetector::NoCut(offset); |
| 113 |
|
|
} |
| 114 |
|
|
|
| 115 |
|
17762461244 |
inline void xor32(const unsigned char byte) { xor32_ = (xor32_ << 1) ^ byte; } |
| 116 |
|
|
|
| 117 |
|
17756389274 |
inline bool CheckThreshold() { |
| 118 |
|
17756389274 |
return abs(static_cast<int32_t>(xor32_) - kMagicNumber) < threshold_; |
| 119 |
|
|
} |
| 120 |
|
|
|
| 121 |
|
|
private: |
| 122 |
|
|
// xor32 only depends on a window of the last 32 bytes in the data stream |
| 123 |
|
|
static const unsigned kXor32Window = 32; |
| 124 |
|
|
static const int32_t kMagicNumber; |
| 125 |
|
|
|
| 126 |
|
|
const uint64_t minimal_chunk_size_; |
| 127 |
|
|
const uint64_t average_chunk_size_; |
| 128 |
|
|
const uint64_t maximal_chunk_size_; |
| 129 |
|
|
const int32_t threshold_; |
| 130 |
|
|
|
| 131 |
|
|
uint64_t xor32_ptr_; |
| 132 |
|
|
uint32_t xor32_; |
| 133 |
|
|
}; |
| 134 |
|
|
|
| 135 |
|
|
#endif // CVMFS_INGESTION_CHUNK_DETECTOR_H_ |
| 136 |
|
|
|