Merge pull request #50 from medvedev1088/feature/get-block-range-for-date

Feature/get block range for date
This commit is contained in:
medvedev1088
2018-07-15 16:52:31 +07:00
committed by GitHub
10 changed files with 307 additions and 1 deletions

View File

@@ -215,6 +215,13 @@ Additional steps:
#### Command Reference
- [export_blocks_and_transactions.py](#export_blocks_and_transactionspy)
- [export_erc20_transfers.py](#export_erc20_transferspy)
- [export_receipts_and_logs.py](#export_receipts_and_logspy)
- [export_contracts.py](#export_contractspy)
- [export_erc20_tokens.py](#export_erc20_tokenspy)
- [get_block_range_for_date.py](#get_block_range_for_datepy)
All the commands accept the `-h` parameter for help, e.g.:
```bash
@@ -342,6 +349,13 @@ which need to be deduplicated (see Querying in Google BigQuery section).
Upvote this pull request to make erc20_tokens export faster
https://github.com/ethereum/web3.py/pull/944#issuecomment-403957468
##### get_block_range_for_date.py
```bash
> python get_block_range_for_date.py --provider-uri=https://mainnet.infura.io/ --date 2018-01-01
4832686,4838611
```
#### Running Tests
```bash

View File

@@ -0,0 +1,58 @@
from datetime import datetime, timezone
from ethereumetl.service.graph_operations import GraphOperations, OutOfBoundsError, Point
class EthService(object):
    """Translates dates and Unix timestamps into ranges of block numbers.

    Lookups are delegated to a ``GraphOperations`` instance built over a
    ``BlockTimestampGraph`` for the given web3 object.
    """

    def __init__(self, web3):
        """Build the block/timestamp graph search for ``web3``."""
        graph = BlockTimestampGraph(web3)
        self._graph_operations = GraphOperations(graph)

    def get_block_range_for_date(self, date):
        """Return ``(start_block, end_block)`` for the UTC day of ``date``.

        The day spans from ``datetime.min.time()`` to ``datetime.max.time()``
        in UTC; both ends are converted to timestamps and passed to
        ``get_block_range_for_timestamps``.
        """
        start_datetime = datetime.combine(date, datetime.min.time(), tzinfo=timezone.utc)
        end_datetime = datetime.combine(date, datetime.max.time(), tzinfo=timezone.utc)
        return self.get_block_range_for_timestamps(start_datetime.timestamp(), end_datetime.timestamp())

    def get_block_range_for_timestamps(self, start_timestamp, end_timestamp):
        """Return ``(start_block, end_block)`` for the inclusive timestamp range.

        Both timestamps are truncated to integers before the lookup.

        Raises:
            ValueError: if ``start_timestamp`` is greater than ``end_timestamp``,
                or if both timestamps fall between the same two blocks.
            OutOfBoundsError: propagated from the lookup of ``end_timestamp``.
        """
        start_timestamp = int(start_timestamp)
        end_timestamp = int(end_timestamp)
        if start_timestamp > end_timestamp:
            raise ValueError('start_timestamp must be less than or equal to end_timestamp')

        try:
            start_block_bounds = self._graph_operations.get_bounds_for_y_coordinate(start_timestamp)
        except OutOfBoundsError:
            # An out-of-bounds start is treated as "from block 0"; an out-of-bounds
            # end is deliberately not caught below and propagates to the caller.
            start_block_bounds = (0, 0)

        end_block_bounds = self._graph_operations.get_bounds_for_y_coordinate(end_timestamp)

        # Identical, non-degenerate bounds mean both timestamps sit in the gap
        # between the same pair of blocks.
        if start_block_bounds == end_block_bounds and start_block_bounds[0] != start_block_bounds[1]:
            raise ValueError('The given timestamp range does not cover any blocks')

        # First block at or after the start, last block at or before the end.
        start_block = start_block_bounds[1]
        end_block = end_block_bounds[0]

        # The genesis block has timestamp 0 but we include it with the 1st block.
        if start_block == 1:
            start_block = 0

        return start_block, end_block
class BlockTimestampGraph(object):
    """Exposes blocks fetched through web3 as graph points via ``block_to_point``."""

    def __init__(self, web3):
        self._web3 = web3

    def get_first_point(self):
        """Return the point for block 1."""
        # Ignore the genesis block as its timestamp is 0
        first_block = self._web3.eth.getBlock(1)
        return block_to_point(first_block)

    def get_last_point(self):
        """Return the point for the block web3 reports as 'latest'."""
        latest_block = self._web3.eth.getBlock('latest')
        return block_to_point(latest_block)

    def get_point(self, x):
        """Return the point for the block identified by ``x``."""
        requested_block = self._web3.eth.getBlock(x)
        return block_to_point(requested_block)
def block_to_point(block):
    """Convert a block into a ``Point`` with x = block number and y = block timestamp."""
    number, timestamp = block.number, block.timestamp
    return Point(number, timestamp)

View File

@@ -0,0 +1,118 @@
from ethereumetl.utils import pairwise
class GraphOperations(object):
    """Finds the x coordinates of the graph points that bracket a given y coordinate.

    The graph object must provide ``get_first_point()``, ``get_last_point()`` and
    ``get_point(x)``, each returning an object with ``x`` and ``y`` attributes.
    Every point fetched is appended to an in-memory list that later lookups
    consult to choose their starting bounds.
    """

    def __init__(self, graph):
        """Store the graph to search.

        The x axis on the graph must be integers, and the y value must increase
        strictly monotonically as x increases.
        """
        self._graph = graph
        # Points fetched so far; candidates for the starting bounds of later lookups.
        self._cached_points = []

    def get_bounds_for_y_coordinate(self, y):
        """Return a pair of x coordinates for the closest points that bound ``y``.

        The left and right bounds are equal when ``y`` equals the y coordinate of
        one of the points.

        Raises:
            OutOfBoundsError: if ``y`` lies outside the y range of the bounds used
                to start the search.
        """
        initial_bounds = find_best_bounds(y, self._cached_points)
        if initial_bounds is None:
            # No cached pair brackets y: fall back to the full extent of the graph.
            initial_bounds = self._get_first_point(), self._get_last_point()

        result = self._get_bounds_for_y_coordinate_recursive(y, *initial_bounds)
        return result

    def _get_bounds_for_y_coordinate_recursive(self, y, start, end):
        """Narrow the ``start``/``end`` points around ``y`` and return the bounding x pair.

        Recursion stops when ``y`` matches one of the two points exactly or when
        the points are at most one x step apart.
        """
        if y < start.y or y > end.y:
            raise OutOfBoundsError('y coordinate {} is out of bounds for points {}-{}'.format(y, start, end))

        if y == start.y:
            return start.x, start.x
        elif y == end.y:
            return end.x, end.x
        elif (end.x - start.x) <= 1:
            return start.x, end.x
        else:
            # Given the guards above, y is strictly between start.y and end.y here.
            assert start.y < y < end.y
            if start.y >= end.y:
                raise ValueError('y must increase strictly monotonically')

            # Find the 1st estimation by linear interpolation from the start and end points.
            # If the 1st estimation is below the needed y coordinate (graph is concave),
            # compute the next estimation by interpolating with the start and 1st estimation points
            # (it will likely be above the needed y).
            # If the 1st estimation is above the needed y coordinate (graph is convex),
            # compute the next estimation by interpolating with the 1st estimation and end points
            # (it will likely be below the needed y).
            # Still runs in log(n) time but about 2-3 times faster than the naive dichotomy method.
            estimation1_x = interpolate(start, end, y)
            # bound() moves an estimate that lands on or outside start.x/end.x one step inside.
            estimation1_x = bound(estimation1_x, (start.x, end.x))
            estimation1 = self._get_point(estimation1_x)

            if estimation1.y < y:
                points = (start, estimation1)
            else:
                points = (estimation1, end)

            estimation2_x = interpolate(*points, y)
            estimation2_x = bound(estimation2_x, (start.x, end.x))
            estimation2 = self._get_point(estimation2_x)

            # Pick the tightest bracketing pair among the four known points and recurse on it.
            all_points = [start, estimation1, estimation2, end]

            bounds = find_best_bounds(y, all_points)
            if bounds is None:
                raise ValueError('Unable to find bounds for points {} and y coordinate {}'.format(points, y))

            return self._get_bounds_for_y_coordinate_recursive(y, *bounds)

    def _get_point(self, x):
        """Fetch the point at ``x`` from the graph and remember it."""
        point = self._graph.get_point(x)
        self._cached_points.append(point)
        return point

    def _get_first_point(self):
        """Fetch the graph's first point and remember it."""
        point = self._graph.get_first_point()
        self._cached_points.append(point)
        return point

    def _get_last_point(self):
        """Fetch the graph's last point and remember it."""
        point = self._graph.get_last_point()
        self._cached_points.append(point)
        return point
def find_best_bounds(y, points):
sorted_points = sorted(points, key=lambda point: point.y)
for point1, point2 in pairwise(sorted_points):
if point1.y <= y <= point2.y:
return point1, point2
return None
def interpolate(point1, point2, y):
    """Return the x at which the line through the two points reaches ``y``.

    The result is truncated to an int. Raises ValueError when both points
    share the same y coordinate, since the line is then horizontal.
    """
    rise = point2.y - point1.y
    if rise == 0:
        raise ValueError('The y coordinate for points is the same {}, {}'.format(point1, point2))

    run = point2.x - point1.x
    scaled_offset = (y - point1.y) * run
    return int(scaled_offset / rise + point1.x)
def bound(x, bounds):
    """Return ``x`` unchanged if it lies strictly between the bounds.

    Otherwise return the value one step inside the bound that ``x`` reached
    or crossed. The two bounds may be given in either order.
    """
    first, second = bounds
    low, high = (second, first) if first > second else (first, second)

    if x <= low:
        return low + 1
    if x >= high:
        return high - 1
    return x
class OutOfBoundsError(Exception):
    """Raised when a y coordinate lies outside the range of the points being searched."""
class Point(object):
    """A graph point holding an ``x`` and a ``y`` coordinate."""

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __str__(self):
        return '({},{})'.format(self.x, self.y)

    def __repr__(self):
        # Containers format their elements with repr(), so a tuple or list of
        # points embedded in a message stays readable instead of showing
        # the default object address.
        return 'Point({!r}, {!r})'.format(self.x, self.y)

View File

@@ -1,3 +1,6 @@
import itertools
def hex_to_dec(hex_string):
if hex_string is None:
return None
@@ -44,3 +47,10 @@ def dynamic_batch_iterator(iterable, batch_size_getter):
batch_size = batch_size_getter()
if len(batch) > 0:
yield batch
def pairwise(iterable):
    """Yield overlapping neighbours: s -> (s0, s1), (s1, s2), (s2, s3), ..."""
    leading, trailing = itertools.tee(iterable, 2)
    # Advance the second copy by one so that zip pairs each item with its successor.
    next(trailing, None)
    return zip(leading, trailing)

15
ethereumetl/web3_utils.py Normal file
View File

@@ -0,0 +1,15 @@
from urllib.parse import urlparse
from web3 import IPCProvider, HTTPProvider
# Timeout passed to IPCProvider for file:// URIs.
DEFAULT_IPC_TIMEOUT = 60


def get_provider_from_uri(uri_string):
    """Create the web3 provider that matches the scheme of ``uri_string``.

    ``file`` URIs yield an IPCProvider on the URI path; ``http`` and ``https``
    URIs yield an HTTPProvider on the full URI. Any other scheme raises
    ValueError.
    """
    parsed = urlparse(uri_string)
    scheme = parsed.scheme

    if scheme == 'file':
        return IPCProvider(parsed.path, timeout=DEFAULT_IPC_TIMEOUT)
    if scheme in ('http', 'https'):
        return HTTPProvider(uri_string)
    raise ValueError('Unknown uri scheme {}'.format(uri_string))

View File

@@ -5,7 +5,7 @@ from ethereumetl.csv_utils import set_max_field_size_limit
from ethereumetl.file_utils import smart_open
parser = argparse.ArgumentParser(description='Extracts a single column from a given csv file.')
parser.add_argument('-i', '--input', default=None, type=str, help='The input file. If not specified stdin is used.')
parser.add_argument('-i', '--input', default='-', type=str, help='The input file. If not specified stdin is used.')
parser.add_argument('-o', '--output', default='-', type=str, help='The output file. If not specified stdout is used.')
parser.add_argument('-c', '--column', required=True, type=str, help='The csv column name to extract.')

View File

@@ -0,0 +1,26 @@
import argparse
from datetime import datetime
from web3 import Web3
from ethereumetl.file_utils import smart_open
from ethereumetl.service.eth_service import EthService
from ethereumetl.web3_utils import get_provider_from_uri
# Command-line entry point: prints "<start_block>,<end_block>" for the given date.
parser = argparse.ArgumentParser(description='Outputs the start block and end block for a given date.')
# The provider URI has no usable default, so it is required: argparse then reports
# a usage error instead of the run failing later while building the provider.
parser.add_argument('-p', '--provider-uri', required=True, type=str,
                    help='The URI of the web3 provider e.g. file://$HOME/Library/Ethereum/geth.ipc. or https://mainnet.infura.io/')
parser.add_argument('-o', '--output', default='-', type=str, help='The output file. If not specified stdout is used.')
parser.add_argument('-d', '--date', required=True, type=lambda d: datetime.strptime(d, '%Y-%m-%d'),
                    help='The date e.g. 2018-01-01.')

args = parser.parse_args()

provider = get_provider_from_uri(args.provider_uri)
web3 = Web3(provider)
eth_service = EthService(web3)

start_block, end_block = eth_service.get_block_range_for_date(args.date)

with smart_open(args.output, 'w') as output_file:
    output_file.write('{},{}'.format(start_block, end_block))

View File

@@ -1,2 +1,3 @@
web3~=4.3.0
pytest~=3.2.0
python-dateutil~=2.7.0

View File

@@ -0,0 +1,64 @@
import os
import pytest
from dateutil.parser import parse
from web3 import HTTPProvider, Web3
from ethereumetl.service.eth_service import EthService
from ethereumetl.service.graph_operations import OutOfBoundsError
# Tests carrying this mark are skipped unless ETHEREUM_ETL_RUN_SLOW_TESTS is set to 1.
run_slow_tests = os.environ.get('ETHEREUM_ETL_RUN_SLOW_TESTS') == '1'
skip_slow_tests = pytest.mark.skipif(
    not run_slow_tests,
    reason='Slow running tests',
)
@skip_slow_tests
@pytest.mark.parametrize("date,expected_start_block,expected_end_block", [
    ('2015-07-30', 0, 6911),
    ('2015-07-31', 6912, 13774),
    ('2017-01-01', 2912407, 2918517),
    ('2017-01-02', 2918518, 2924575),
    ('2018-06-10', 5761663, 5767303)
])
def test_get_block_range_for_date(date, expected_start_block, expected_end_block):
    """Each date maps to the expected first and last block numbers."""
    service = get_new_eth_service()
    block_range = service.get_block_range_for_date(parse(date))
    assert block_range == (expected_start_block, expected_end_block)
@skip_slow_tests
@pytest.mark.parametrize("date", [
    '2015-07-29',
    '2030-01-01'
])
def test_get_block_range_for_date_fail(date):
    """Dates outside the range covered by the chain raise OutOfBoundsError."""
    service = get_new_eth_service()
    requested_day = parse(date)
    with pytest.raises(OutOfBoundsError):
        service.get_block_range_for_date(requested_day)
@skip_slow_tests
@pytest.mark.parametrize("start_timestamp,end_timestamp,expected_start_block,expected_end_block", [
    (1438270128, 1438270128, 10, 10),
    (1438270128, 1438270129, 10, 10)
])
def test_get_block_range_for_timestamps(start_timestamp, end_timestamp, expected_start_block, expected_end_block):
    """Timestamp ranges that include a block's timestamp resolve to that block."""
    service = get_new_eth_service()
    block_range = service.get_block_range_for_timestamps(start_timestamp, end_timestamp)
    assert block_range == (expected_start_block, expected_end_block)
@skip_slow_tests
@pytest.mark.parametrize("start_timestamp,end_timestamp", [
    (1438270129, 1438270131)
])
def test_get_block_range_for_timestamps_fail(start_timestamp, end_timestamp):
    """A timestamp range expected to contain no block raises ValueError."""
    service = get_new_eth_service()
    with pytest.raises(ValueError):
        service.get_block_range_for_timestamps(start_timestamp, end_timestamp)
def get_new_eth_service():
    """Build a fresh EthService on an HTTP provider for https://mainnet.infura.io/."""
    provider = HTTPProvider('https://mainnet.infura.io/')
    return EthService(Web3(provider))