contrib/linearize/linearize-data.py
#!/usr/bin/python
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-2014 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#

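# Usage: linearize-data.py CONFIG-FILE
#
# The config file is parsed as simple key=value lines (see __main__ below).
# An illustrative example, with hypothetical paths, might look like:
#
#   netmagic=f9beb4d9
#   genesis=000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f
#   input=/home/example/.bitcoin/blocks
#   hashlist=hashlist.txt
#   output_file=bootstrap.dat
#   max_out_sz=1000000000
#
# Either output_file (a single bootstrap.dat) or output (a directory of
# blk*.dat files) must be given; all other keys fall back to the defaults
# assigned near the bottom of this script.
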
from __future__ import print_function, division
import json
import struct
import re
import os
import base64
import httplib
import sys
import hashlib
import datetime
import time
from collections import namedtuple

settings = {}

def uint32(x):
	return x & 0xffffffffL

def bytereverse(x):
	return uint32(( ((x) << 24) | (((x) << 8) & 0x00ff0000) |
			(((x) >> 8) & 0x0000ff00) | ((x) >> 24) ))

def bufreverse(in_buf):
	out_words = []
	for i in range(0, len(in_buf), 4):
		word = struct.unpack('@I', in_buf[i:i+4])[0]
		out_words.append(struct.pack('@I', bytereverse(word)))
	return ''.join(out_words)

def wordreverse(in_buf):
	out_words = []
	for i in range(0, len(in_buf), 4):
		out_words.append(in_buf[i:i+4])
	out_words.reverse()
	return ''.join(out_words)

def calc_hdr_hash(blk_hdr):
	hash1 = hashlib.sha256()
	hash1.update(blk_hdr)
	hash1_o = hash1.digest()

	hash2 = hashlib.sha256()
	hash2.update(hash1_o)
	hash2_o = hash2.digest()

	return hash2_o

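# calc_hash_str converts the raw double-SHA256 digest into the conventional
# hex form used in hash lists (e.g. RPC getblockhash output). bufreverse
# followed by wordreverse amounts to reversing all 32 bytes, so the
# little-endian internal hash comes out as the familiar big-endian hex string.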
def calc_hash_str(blk_hdr):
	hash = calc_hdr_hash(blk_hdr)
	hash = bufreverse(hash)
	hash = wordreverse(hash)
	hash_str = hash.encode('hex')
	return hash_str

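# get_blk_dt extracts nTime from the block header: in the standard 80-byte
# Bitcoin-style header, bytes 68..71 (after nVersion, hashPrevBlock and
# hashMerkleRoot) hold the little-endian timestamp. It returns both the
# timestamp and the first day of its month, which drives split_timestamp.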
def get_blk_dt(blk_hdr):
	members = struct.unpack("<I", blk_hdr[68:68+4])
	nTime = members[0]
	dt = datetime.datetime.fromtimestamp(nTime)
	dt_ym = datetime.datetime(dt.year, dt.month, 1)
	return (dt_ym, nTime)

def get_block_hashes(settings):
	blkindex = []
	f = open(settings['hashlist'], "r")
	for line in f:
		line = line.rstrip()
		blkindex.append(line)

	print("Read " + str(len(blkindex)) + " hashes")

	return blkindex

def mkblockmap(blkindex):
	blkmap = {}
	for height,hash in enumerate(blkindex):
		blkmap[hash] = height
	return blkmap

# Block header and extent on disk
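# (fn = input blk file number, offset = file offset of the block body just
#  past the 80-byte header, inhdr = 8-byte magic+length preamble,
#  blkhdr = the 80-byte header itself, size = body length in bytes)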
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])

class BlockDataCopier:
	def __init__(self, settings, blkindex, blkmap):
		self.settings = settings
		self.blkindex = blkindex
		self.blkmap = blkmap

		self.inFn = 0
		self.inF = None
		self.outFn = 0
		self.outsz = 0
		self.outF = None
		self.outFname = None
		self.blkCountIn = 0
		self.blkCountOut = 0

		self.lastDate = datetime.datetime(2000, 1, 1)
		self.highTS = 1408893517 - 315360000
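		# highTS tracks the highest block timestamp written so far; it is
		# seeded well in the past and used as the mtime when setFileTime
		# touches completed output files.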
		self.timestampSplit = False
		self.fileOutput = True
		self.setFileTime = False
		self.maxOutSz = settings['max_out_sz']
		if 'output' in settings:
			self.fileOutput = False
		if settings['file_timestamp'] != 0:
			self.setFileTime = True
		if settings['split_timestamp'] != 0:
			self.timestampSplit = True
		# Extents and cache for out-of-order blocks
		self.blockExtents = {}
		self.outOfOrderData = {}
		self.outOfOrderSize = 0 # running total size for items in outOfOrderData

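	# writeBlock appends one record (magic + length preamble, 80-byte header,
	# block body) to the current output file, opening a new one when needed.
	# In directory mode it rotates files once max_out_sz would be exceeded;
	# with split_timestamp set it also starts a new file at each month change.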
	def writeBlock(self, inhdr, blk_hdr, rawblock):
		blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
		if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
			self.outF.close()
			if self.setFileTime:
				os.utime(self.outFname, (int(time.time()), self.highTS))
			self.outF = None
			self.outFname = None
			self.outFn = self.outFn + 1
			self.outsz = 0

		(blkDate, blkTS) = get_blk_dt(blk_hdr)
		if self.timestampSplit and (blkDate > self.lastDate):
			print("New month " + blkDate.strftime("%Y-%m"))
			self.lastDate = blkDate
			if self.outF:
				self.outF.close()
				if self.setFileTime:
					os.utime(self.outFname, (int(time.time()), self.highTS))
				self.outF = None
				self.outFname = None
				self.outFn = self.outFn + 1
				self.outsz = 0

		if not self.outF:
			if self.fileOutput:
				self.outFname = self.settings['output_file']
			else:
				self.outFname = "%s/blk%05d.dat" % (self.settings['output'], self.outFn)
			print("Output file " + self.outFname)
			self.outF = open(self.outFname, "wb")

		self.outF.write(inhdr)
		self.outF.write(blk_hdr)
		self.outF.write(rawblock)
		self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

		self.blkCountOut = self.blkCountOut + 1
		if blkTS > self.highTS:
			self.highTS = blkTS

		if (self.blkCountOut % 1000) == 0:
			print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
				(self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

	def inFileName(self, fn):
		return "%s/blk%05d.dat" % (self.settings['input'], fn)

	def fetchBlock(self, extent):
		'''Fetch block contents from disk given extents'''
		with open(self.inFileName(extent.fn), "rb") as f:
			f.seek(extent.offset)
			return f.read(extent.size)

	def copyOneBlock(self):
		'''Find the next block to be written in the input, and copy it to the output.'''
		extent = self.blockExtents.pop(self.blkCountOut)
		if self.blkCountOut in self.outOfOrderData:
			# If the data is cached, use it from memory and remove from the cache
			rawblock = self.outOfOrderData.pop(self.blkCountOut)
			self.outOfOrderSize -= len(rawblock)
		else: # Otherwise look up data on disk
			rawblock = self.fetchBlock(extent)

		self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

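	# run() walks the input blk*.dat files sequentially. Each record there is
	# a 4-byte network magic, a 4-byte little-endian block length, and then
	# the serialized block (80-byte header first). Blocks that appear on disk
	# ahead of their height are parked in blockExtents / outOfOrderData and
	# flushed once the intervening heights have been written.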
	def run(self):
		while self.blkCountOut < len(self.blkindex):
			if not self.inF:
				fname = self.inFileName(self.inFn)
				print("Input file " + fname)
				try:
					self.inF = open(fname, "rb")
				except IOError:
					print("Premature end of block data")
					return

			inhdr = self.inF.read(8)
			if (not inhdr or (inhdr[0] == "\0")):
				self.inF.close()
				self.inF = None
				self.inFn = self.inFn + 1
				continue

			inMagic = inhdr[:4]
			if (inMagic != self.settings['netmagic']):
				print("Invalid magic: " + inMagic.encode('hex'))
				return
			inLenLE = inhdr[4:]
			su = struct.unpack("<I", inLenLE)
			inLen = su[0] - 80 # length without header
			blk_hdr = self.inF.read(80)
			inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

			hash_str = calc_hash_str(blk_hdr)
			if hash_str not in self.blkmap:
				print("Skipping unknown block " + hash_str)
				self.inF.seek(inLen, os.SEEK_CUR)
				continue

			blkHeight = self.blkmap[hash_str]
			self.blkCountIn += 1

			if self.blkCountOut == blkHeight:
				# If in-order block, just copy
				rawblock = self.inF.read(inLen)
				self.writeBlock(inhdr, blk_hdr, rawblock)

				# See if we can catch up to prior out-of-order blocks
				while self.blkCountOut in self.blockExtents:
					self.copyOneBlock()

			else: # If out-of-order, skip over block data for now
				self.blockExtents[blkHeight] = inExtent
				if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
					# If there is space in the cache, read the data
					# Reading the data in file sequence instead of seeking and fetching it later is preferred,
					# but we don't want to fill up memory
					self.outOfOrderData[blkHeight] = self.inF.read(inLen)
					self.outOfOrderSize += inLen
				else: # If no space in cache, seek forward
					self.inF.seek(inLen, os.SEEK_CUR)

		print("Done (%i blocks written)" % (self.blkCountOut))

if __name__ == '__main__':
	if len(sys.argv) != 2:
		print("Usage: linearize-data.py CONFIG-FILE")
		sys.exit(1)

	f = open(sys.argv[1])
	for line in f:
		# skip comment lines
		m = re.search(r'^\s*#', line)
		if m:
			continue

		# parse key=value lines
		m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
		if m is None:
			continue
		settings[m.group(1)] = m.group(2)
	f.close()

	if 'netmagic' not in settings:
		settings['netmagic'] = 'f9beb4d9'
	if 'genesis' not in settings:
		settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
	if 'input' not in settings:
		settings['input'] = 'input'
	if 'hashlist' not in settings:
		settings['hashlist'] = 'hashlist.txt'
	if 'file_timestamp' not in settings:
		settings['file_timestamp'] = 0
	if 'split_timestamp' not in settings:
		settings['split_timestamp'] = 0
	if 'max_out_sz' not in settings:
		settings['max_out_sz'] = 1000L * 1000 * 1000
	if 'out_of_order_cache_sz' not in settings:
		settings['out_of_order_cache_sz'] = 100 * 1000 * 1000

	settings['max_out_sz'] = long(settings['max_out_sz'])
	settings['split_timestamp'] = int(settings['split_timestamp'])
	settings['file_timestamp'] = int(settings['file_timestamp'])
	settings['netmagic'] = settings['netmagic'].decode('hex')
	settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])

	if 'output_file' not in settings and 'output' not in settings:
		print("Missing output file / directory")
		sys.exit(1)

	blkindex = get_block_hashes(settings)
	blkmap = mkblockmap(blkindex)

	if settings['genesis'] not in blkmap:
		print("Genesis block not found in hashlist")
	else:
		BlockDataCopier(settings, blkindex, blkmap).run()