3 # linearize-data.py: Construct a linear, no-fork version of the chain.
5 # Copyright (c) 2013-2014 The Bitcoin Core developers
6 # Distributed under the MIT software license, see the accompanying
7 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
from __future__ import print_function, division
import binascii
from collections import namedtuple
def uint32(x):
	"""Truncate x to an unsigned 32-bit value."""
	# 0xffffffff (without the Python-2-only 'L' suffix) behaves identically on
	# Python 2 and 3, matching this file's __future__-based compatibility intent.
	return x & 0xffffffff
def bytereverse(x):
	"""Reverse the byte order of a 32-bit value, e.g. 0x12345678 -> 0x78563412."""
	# Mask up front so the shifts below stay within 32 bits; masking each shifted
	# term makes the result self-contained (no dependency on the uint32 helper).
	x &= 0xffffffff
	return (((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |
			((x >> 8) & 0x0000ff00) | (x >> 24))
def bufreverse(in_buf):
	"""Return in_buf with the bytes of each 32-bit word reversed.

	Equivalent to unpacking each 4-byte word with struct and repacking it
	byte-swapped, but endian-independent and with the accumulator actually
	initialized (the visible original never bound out_words).
	"""
	out_words = []
	for i in range(0, len(in_buf), 4):
		# Slicing with [::-1] reverses the 4 bytes of this word directly.
		out_words.append(in_buf[i:i+4][::-1])
	# b''.join keeps this working for bytes on Python 3 (b'' == '' on Python 2).
	return b''.join(out_words)
def wordreverse(in_buf):
	"""Return in_buf with its 32-bit words in reverse order.

	Bytes inside each word are left untouched; only the word order flips.
	The visible original never initialized out_words and never reversed the
	list, which would make it a NameError-prone identity function.
	"""
	out_words = []
	for i in range(0, len(in_buf), 4):
		out_words.append(in_buf[i:i+4])
	# Without this the function would return its input unchanged.
	out_words.reverse()
	return b''.join(out_words)
def calc_hdr_hash(blk_hdr):
	"""Return the double-SHA256 digest of a raw 80-byte block header.

	The visible original never fed blk_hdr into either hasher and returned
	nothing; the update() calls and the return restore the intended
	SHA256(SHA256(header)) computation.
	"""
	hash1 = hashlib.sha256()
	hash1.update(blk_hdr)
	hash1_o = hash1.digest()

	hash2 = hashlib.sha256()
	hash2.update(hash1_o)
	hash2_o = hash2.digest()

	return hash2_o
def calc_hash_str(blk_hdr):
	"""Return the block hash of an 80-byte header as the conventional
	big-endian hex string (double-SHA256, byte/word reversed for display).

	Fixes in view: the function never returned its result, used the
	Python-2-only str.encode('hex'), and shadowed the builtin hash().
	binascii.hexlify() is byte-for-byte equivalent on Python 2 and also
	works on Python 3.
	"""
	h = calc_hdr_hash(blk_hdr)
	h = bufreverse(h)
	h = wordreverse(h)
	return binascii.hexlify(h).decode('ascii')
def get_blk_dt(blk_hdr):
	"""Return (month_start_datetime, nTime) for an 80-byte block header.

	nTime is the little-endian uint32 at header offset 68. The datetime is
	truncated to the first day of its month, which writeBlock uses for
	month-based output file splitting. The visible original never bound
	nTime and returned nothing.
	"""
	members = struct.unpack("<I", blk_hdr[68:68+4])
	nTime = members[0]
	# NOTE(review): fromtimestamp() uses the local timezone, so month
	# boundaries depend on the machine's TZ setting — confirm acceptable.
	dt = datetime.datetime.fromtimestamp(nTime)
	dt_ym = datetime.datetime(dt.year, dt.month, 1)
	return (dt_ym, nTime)
def get_block_hashes(settings):
	"""Read the hashlist file named by settings['hashlist']: one block hash
	per line, in height order. Returns the list of hash strings."""
	blkindex = []
	# 'with' guarantees the hashlist file is closed even on error
	# (the visible original leaked the file handle).
	with open(settings['hashlist'], "r") as f:
		for line in f:
			line = line.rstrip()
			blkindex.append(line)

	print("Read " + str(len(blkindex)) + " hashes")

	return blkindex
def mkblockmap(blkindex):
	"""Return a dict mapping block hash -> block height.

	blkindex is the height-ordered hash list from get_block_hashes(). The
	visible original shadowed the builtin hash() and never built or
	returned the mapping.
	"""
	return {blkhash: height for height, blkhash in enumerate(blkindex)}
# Block header and extent on disk
# fn:     number of the blkNNNNN.dat input file the block was seen in
# offset: byte offset of the block payload (the file position just past the
#         80-byte header when the extent is recorded)
# inhdr:  the 8-byte magic + little-endian length prefix read from the file
# blkhdr: the raw 80-byte block header
# size:   payload length in bytes (block length minus the 80-byte header)
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
class BlockDataCopier:
	"""Copies blocks out of bitcoind blkNNNNN.dat files and rewrites them,
	in hashlist (height) order, into one linear file or a series of files."""

	def __init__(self, settings, blkindex, blkmap):
		self.settings = settings
		self.blkindex = blkindex	# block hashes, in height order
		self.blkmap = blkmap		# block hash -> height (read by run())

		# Input/output file state; files are opened lazily by run()/writeBlock().
		# The visible original never initialized these even though every other
		# method reads them (inFn/inF in run(), outF/outsz/blkCount* in
		# writeBlock()), which would raise AttributeError.
		self.inFn = 0
		self.inF = None
		self.outFn = 0
		self.outF = None
		self.outFname = None
		self.outsz = 0				# bytes written to the current output file
		self.blkCountIn = 0			# blocks scanned from the input
		self.blkCountOut = 0		# blocks written to the output

		self.lastDate = datetime.datetime(2000, 1, 1)
		# Floor used when fixing output file mtimes; raised as newer block
		# timestamps are seen (see writeBlock).
		self.highTS = 1408893517 - 315360000
		self.timestampSplit = False
		self.fileOutput = True
		self.setFileTime = False
		self.maxOutSz = settings['max_out_sz']
		if 'output' in settings:
			self.fileOutput = False
		if settings['file_timestamp'] != 0:
			self.setFileTime = True
		if settings['split_timestamp'] != 0:
			self.timestampSplit = True
		# Extents and cache for out-of-order blocks
		self.blockExtents = {}
		self.outOfOrderData = {}
		self.outOfOrderSize = 0 # running total size for items in outOfOrderData
123 def writeBlock(self, inhdr, blk_hdr, rawblock):
124 if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz):
127 os.utime(outFname, (int(time.time()), highTS))
130 self.outFn = outFn + 1
133 (blkDate, blkTS) = get_blk_dt(blk_hdr)
134 if self.timestampSplit and (blkDate > self.lastDate):
135 print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
140 os.utime(outFname, (int(time.time()), highTS))
143 self.outFn = self.outFn + 1
148 outFname = self.settings['output_file']
150 outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn)
151 print("Output file " + outFname)
152 self.outF = open(outFname, "wb")
154 self.outF.write(inhdr)
155 self.outF.write(blk_hdr)
156 self.outF.write(rawblock)
157 self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
159 self.blkCountOut = self.blkCountOut + 1
160 if blkTS > self.highTS:
163 if (self.blkCountOut % 1000) == 0:
164 print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
165 (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
167 def inFileName(self, fn):
168 return "%s/blk%05d.dat" % (self.settings['input'], fn)
170 def fetchBlock(self, extent):
171 '''Fetch block contents from disk given extents'''
172 with open(self.inFileName(extent.fn), "rb") as f:
173 f.seek(extent.offset)
174 return f.read(extent.size)
176 def copyOneBlock(self):
177 '''Find the next block to be written in the input, and copy it to the output.'''
178 extent = self.blockExtents.pop(self.blkCountOut)
179 if self.blkCountOut in self.outOfOrderData:
180 # If the data is cached, use it from memory and remove from the cache
181 rawblock = self.outOfOrderData.pop(self.blkCountOut)
182 self.outOfOrderSize -= len(rawblock)
183 else: # Otherwise look up data on disk
184 rawblock = self.fetchBlock(extent)
186 self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
189 while self.blkCountOut < len(self.blkindex):
191 fname = self.inFileName(self.inFn)
192 print("Input file " + fname)
194 self.inF = open(fname, "rb")
196 print("Premature end of block data")
199 inhdr = self.inF.read(8)
200 if (not inhdr or (inhdr[0] == "\0")):
203 self.inFn = self.inFn + 1
207 if (inMagic != self.settings['netmagic']):
208 print("Invalid magic: " + inMagic.encode('hex'))
211 su = struct.unpack("<I", inLenLE)
212 inLen = su[0] - 80 # length without header
213 blk_hdr = self.inF.read(80)
214 inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
216 hash_str = calc_hash_str(blk_hdr)
217 if not hash_str in blkmap:
218 print("Skipping unknown block " + hash_str)
219 self.inF.seek(inLen, os.SEEK_CUR)
222 blkHeight = self.blkmap[hash_str]
225 if self.blkCountOut == blkHeight:
226 # If in-order block, just copy
227 rawblock = self.inF.read(inLen)
228 self.writeBlock(inhdr, blk_hdr, rawblock)
230 # See if we can catch up to prior out-of-order blocks
231 while self.blkCountOut in self.blockExtents:
234 else: # If out-of-order, skip over block data for now
235 self.blockExtents[blkHeight] = inExtent
236 if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
237 # If there is space in the cache, read the data
238 # Reading the data in file sequence instead of seeking and fetching it later is preferred,
239 # but we don't want to fill up memory
240 self.outOfOrderData[blkHeight] = self.inF.read(inLen)
241 self.outOfOrderSize += inLen
242 else: # If no space in cache, seek forward
243 self.inF.seek(inLen, os.SEEK_CUR)
245 print("Done (%i blocks written)" % (self.blkCountOut))
if __name__ == '__main__':
	if len(sys.argv) != 2:
		print("Usage: linearize-data.py CONFIG-FILE")
		sys.exit(1)

	settings = {}
	# 'with' closes the config file even on error (the original leaked it).
	with open(sys.argv[1]) as f:
		for line in f:
			# Skip comment lines.
			m = re.search(r'^\s*#', line)
			if m:
				continue

			# parse key=value lines
			m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
			if m is None:
				continue
			settings[m.group(1)] = m.group(2)

	# Defaults (Bitcoin mainnet).
	if 'netmagic' not in settings:
		settings['netmagic'] = 'f9beb4d9'
	if 'genesis' not in settings:
		settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
	if 'input' not in settings:
		settings['input'] = 'input'
	if 'hashlist' not in settings:
		settings['hashlist'] = 'hashlist.txt'
	if 'file_timestamp' not in settings:
		settings['file_timestamp'] = 0
	if 'split_timestamp' not in settings:
		settings['split_timestamp'] = 0
	if 'max_out_sz' not in settings:
		# Plain int literal: Python 2 auto-promotes to long, and the 'L'
		# suffix / long() builtin do not exist on Python 3.
		settings['max_out_sz'] = 1000 * 1000 * 1000
	if 'out_of_order_cache_sz' not in settings:
		settings['out_of_order_cache_sz'] = 100 * 1000 * 1000

	settings['max_out_sz'] = int(settings['max_out_sz'])
	settings['split_timestamp'] = int(settings['split_timestamp'])
	settings['file_timestamp'] = int(settings['file_timestamp'])
	# binascii.unhexlify == str.decode('hex') on Python 2, and works on Python 3.
	settings['netmagic'] = binascii.unhexlify(settings['netmagic'])
	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])

	if 'output_file' not in settings and 'output' not in settings:
		print("Missing output file / directory")
		sys.exit(1)

	blkindex = get_block_hashes(settings)
	blkmap = mkblockmap(blkindex)

	if not settings['genesis'] in blkmap:
		print("Genesis block not found in hashlist")
		sys.exit(1)

	BlockDataCopier(settings, blkindex, blkmap).run()