3 # linearize-data.py: Construct a linear, no-fork version of the chain.
5 # Copyright (c) 2013-2014 The Bitcoin Core developers
6 # Distributed under the MIT software license, see the accompanying
7 # file COPYING or http://www.opensource.org/licenses/mit-license.php.
from __future__ import print_function, division
import binascii
from collections import namedtuple
def uint32(x):
	"""Truncate x to an unsigned 32-bit value."""
	# 0xffffffff (without the Python-2-only 'L' suffix) behaves identically on
	# Python 2 and 3, matching this file's __future__-based compatibility intent.
	return x & 0xffffffff
def bytereverse(x):
	"""Reverse the byte order of a 32-bit value, e.g. 0x12345678 -> 0x78563412."""
	# Mask up front so the shifts below stay within 32 bits; masking each shifted
	# term makes the result self-contained (no dependency on the uint32 helper).
	x &= 0xffffffff
	return (((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) |
			((x >> 8) & 0x0000ff00) | (x >> 24))
def bufreverse(in_buf):
	"""Return in_buf with the bytes of each 32-bit word reversed.

	Equivalent to unpacking each 4-byte word with struct and repacking it
	byte-swapped, but endian-independent and with the accumulator actually
	initialized (the visible original never bound out_words).
	"""
	out_words = []
	for i in range(0, len(in_buf), 4):
		# Slicing with [::-1] reverses the 4 bytes of this word directly.
		out_words.append(in_buf[i:i+4][::-1])
	# b''.join keeps this working for bytes on Python 3 (b'' == '' on Python 2).
	return b''.join(out_words)
def wordreverse(in_buf):
	"""Return in_buf with its 32-bit words in reverse order.

	Bytes inside each word are left untouched; only the word order flips.
	The visible original never initialized out_words and never reversed the
	list, which would make it a NameError-prone identity function.
	"""
	out_words = []
	for i in range(0, len(in_buf), 4):
		out_words.append(in_buf[i:i+4])
	# Without this the function would return its input unchanged.
	out_words.reverse()
	return b''.join(out_words)
def calc_hdr_hash(blk_hdr):
	"""Return the double-SHA256 digest of a raw 80-byte block header.

	The visible original never fed blk_hdr into either hasher and returned
	nothing; the update() calls and the return restore the intended
	SHA256(SHA256(header)) computation.
	"""
	hash1 = hashlib.sha256()
	hash1.update(blk_hdr)
	hash1_o = hash1.digest()

	hash2 = hashlib.sha256()
	hash2.update(hash1_o)
	hash2_o = hash2.digest()

	return hash2_o
def calc_hash_str(blk_hdr):
	"""Return the block hash of an 80-byte header as the conventional
	big-endian hex string (double-SHA256, byte/word reversed for display).

	Fixes in view: the function never returned its result, used the
	Python-2-only str.encode('hex'), and shadowed the builtin hash().
	binascii.hexlify() is byte-for-byte equivalent on Python 2 and also
	works on Python 3.
	"""
	h = calc_hdr_hash(blk_hdr)
	h = bufreverse(h)
	h = wordreverse(h)
	return binascii.hexlify(h).decode('ascii')
def get_blk_dt(blk_hdr):
	"""Return (month_start_datetime, nTime) for an 80-byte block header.

	nTime is the little-endian uint32 at header offset 68. The datetime is
	truncated to the first day of its month, which writeBlock uses for
	month-based output file splitting. The visible original never bound
	nTime and returned nothing.
	"""
	members = struct.unpack("<I", blk_hdr[68:68+4])
	nTime = members[0]
	# NOTE(review): fromtimestamp() uses the local timezone, so month
	# boundaries depend on the machine's TZ setting — confirm acceptable.
	dt = datetime.datetime.fromtimestamp(nTime)
	dt_ym = datetime.datetime(dt.year, dt.month, 1)
	return (dt_ym, nTime)
def get_block_hashes(settings):
	"""Read the hashlist file named by settings['hashlist']: one block hash
	per line, in height order. Returns the list of hash strings."""
	blkindex = []
	# 'with' guarantees the hashlist file is closed even on error
	# (the visible original leaked the file handle).
	with open(settings['hashlist'], "r") as f:
		for line in f:
			line = line.rstrip()
			blkindex.append(line)

	print("Read " + str(len(blkindex)) + " hashes")

	return blkindex
def mkblockmap(blkindex):
	"""Return a dict mapping block hash -> block height.

	blkindex is the height-ordered hash list from get_block_hashes(). The
	visible original shadowed the builtin hash() and never built or
	returned the mapping.
	"""
	return {blkhash: height for height, blkhash in enumerate(blkindex)}
# Block header and extent on disk
# fn:     number of the blkNNNNN.dat input file the block was seen in
# offset: byte offset of the block payload (the file position just past the
#         80-byte header when the extent is recorded)
# inhdr:  the 8-byte magic + little-endian length prefix read from the file
# blkhdr: the raw 80-byte block header
# size:   payload length in bytes (block length minus the 80-byte header)
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
class BlockDataCopier:
	"""Copies blocks out of bitcoind blkNNNNN.dat files and rewrites them,
	in hashlist (height) order, into one linear file or a series of files."""

	def __init__(self, settings, blkindex, blkmap):
		self.settings = settings
		self.blkindex = blkindex	# block hashes, in height order
		self.blkmap = blkmap		# block hash -> height (read by run())

		# Input/output file state; files are opened lazily by run()/writeBlock().
		# The visible original never initialized these even though every other
		# method reads them (inFn/inF in run(), outF/outsz/blkCount* in
		# writeBlock()), which would raise AttributeError.
		self.inFn = 0
		self.inF = None
		self.outFn = 0
		self.outF = None
		self.outFname = None
		self.outsz = 0				# bytes written to the current output file
		self.blkCountIn = 0			# blocks scanned from the input
		self.blkCountOut = 0		# blocks written to the output

		self.lastDate = datetime.datetime(2000, 1, 1)
		# Floor used when fixing output file mtimes; raised as newer block
		# timestamps are seen (see writeBlock).
		self.highTS = 1408893517 - 315360000
		self.timestampSplit = False
		self.fileOutput = True
		self.setFileTime = False
		self.maxOutSz = settings['max_out_sz']
		if 'output' in settings:
			self.fileOutput = False
		if settings['file_timestamp'] != 0:
			self.setFileTime = True
		if settings['split_timestamp'] != 0:
			self.timestampSplit = True
		# Extents and cache for out-of-order blocks
		self.blockExtents = {}
		self.outOfOrderData = {}
		self.outOfOrderSize = 0 # running total size for items in outOfOrderData
123 def writeBlock(self, inhdr, blk_hdr, rawblock):
124 if not self.fileOutput and ((self.outsz + self.inLen) > self.maxOutSz):
127 os.utime(outFname, (int(time.time()), highTS))
130 self.outFn = outFn + 1
133 (blkDate, blkTS) = get_blk_dt(blk_hdr)
134 if self.timestampSplit and (blkDate > self.lastDate):
135 print("New month " + blkDate.strftime("%Y-%m") + " @ " + hash_str)
140 os.utime(outFname, (int(time.time()), highTS))
143 self.outFn = self.outFn + 1
148 outFname = self.settings['output_file']
150 outFname = "%s/blk%05d.dat" % (self.settings['output'], outFn)
151 print("Output file " + outFname)
152 self.outF = open(outFname, "wb")
154 self.outF.write(inhdr)
155 self.outF.write(blk_hdr)
156 self.outF.write(rawblock)
157 self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)
159 self.blkCountOut = self.blkCountOut + 1
160 if blkTS > self.highTS:
163 if (self.blkCountOut % 1000) == 0:
164 print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
165 (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))
167 def inFileName(self, fn):
168 return "%s/blk%05d.dat" % (self.settings['input'], fn)
170 def fetchBlock(self, extent):
171 '''Fetch block contents from disk given extents'''
172 with open(self.inFileName(extent.fn), "rb") as f:
173 f.seek(extent.offset)
174 return f.read(extent.size)
176 def copyOneBlock(self):
177 '''Find the next block to be written in the input, and copy it to the output.'''
178 extent = self.blockExtents.pop(self.blkCountOut)
179 if self.blkCountOut in self.outOfOrderData:
180 # If the data is cached, use it from memory and remove from the cache
181 rawblock = self.outOfOrderData.pop(self.blkCountOut)
182 self.outOfOrderSize -= len(rawblock)
183 else: # Otherwise look up data on disk
184 rawblock = self.fetchBlock(extent)
186 self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)
189 while self.blkCountOut < len(self.blkindex):
191 fname = self.inFileName(self.inFn)
192 print("Input file " + fname)
194 self.inF = open(fname, "rb")
196 print("Premature end of block data")
199 inhdr = self.inF.read(8)
200 if (not inhdr or (inhdr[0] == "\0")):
203 self.inFn = self.inFn + 1
207 if (inMagic != self.settings['netmagic']):
208 print("Invalid magic: " + inMagic.encode('hex'))
211 su = struct.unpack("<I", inLenLE)
212 inLen = su[0] - 80 # length without header
213 blk_hdr = self.inF.read(80)
214 inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)
216 hash_str = calc_hash_str(blk_hdr)
217 if not hash_str in blkmap:
218 print("Skipping unknown block " + hash_str)
219 self.inF.seek(inLen, os.SEEK_CUR)
222 blkHeight = self.blkmap[hash_str]
225 if self.blkCountOut == blkHeight:
226 # If in-order block, just copy
227 rawblock = self.inF.read(inLen)
228 self.writeBlock(inhdr, blk_hdr, rawblock)
230 # See if we can catch up to prior out-of-order blocks
231 while self.blkCountOut in self.blockExtents:
234 else: # If out-of-order, skip over block data for now
235 self.blockExtents[blkHeight] = inExtent
236 if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
237 # If there is space in the cache, read the data
238 # Reading the data in file sequence instead of seeking and fetching it later is preferred,
239 # but we don't want to fill up memory
240 self.outOfOrderData[blkHeight] = self.inF.read(inLen)
241 self.outOfOrderSize += inLen
242 else: # If no space in cache, seek forward
243 self.inF.seek(inLen, os.SEEK_CUR)
245 print("Done (%i blocks written)" % (self.blkCountOut))
if __name__ == '__main__':
	if len(sys.argv) != 2:
		print("Usage: linearize-data.py CONFIG-FILE")
		sys.exit(1)

	settings = {}
	# 'with' closes the config file even on error (the original leaked it).
	with open(sys.argv[1]) as f:
		for line in f:
			# Skip comment lines.
			m = re.search(r'^\s*#', line)
			if m:
				continue

			# parse key=value lines
			m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
			if m is None:
				continue
			settings[m.group(1)] = m.group(2)

	# Defaults (Bitcoin mainnet).
	if 'netmagic' not in settings:
		settings['netmagic'] = 'f9beb4d9'
	if 'genesis' not in settings:
		settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
	if 'input' not in settings:
		settings['input'] = 'input'
	if 'hashlist' not in settings:
		settings['hashlist'] = 'hashlist.txt'
	if 'file_timestamp' not in settings:
		settings['file_timestamp'] = 0
	if 'split_timestamp' not in settings:
		settings['split_timestamp'] = 0
	if 'max_out_sz' not in settings:
		# Plain int literal: Python 2 auto-promotes to long, and the 'L'
		# suffix / long() builtin do not exist on Python 3.
		settings['max_out_sz'] = 1000 * 1000 * 1000
	if 'out_of_order_cache_sz' not in settings:
		settings['out_of_order_cache_sz'] = 100 * 1000 * 1000

	settings['max_out_sz'] = int(settings['max_out_sz'])
	settings['split_timestamp'] = int(settings['split_timestamp'])
	settings['file_timestamp'] = int(settings['file_timestamp'])
	# binascii.unhexlify == str.decode('hex') on Python 2, and works on Python 3.
	settings['netmagic'] = binascii.unhexlify(settings['netmagic'])
	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])

	if 'output_file' not in settings and 'output' not in settings:
		print("Missing output file / directory")
		sys.exit(1)

	blkindex = get_block_hashes(settings)
	blkmap = mkblockmap(blkindex)

	if not settings['genesis'] in blkmap:
		print("Genesis block not found in hashlist")
		sys.exit(1)

	BlockDataCopier(settings, blkindex, blkmap).run()