Update track generation

This commit is contained in:
Eugene Burmakin
2025-07-23 18:21:21 +02:00
parent 4044e77fcd
commit d8033a1e27
17 changed files with 8103 additions and 29 deletions

View File

@@ -52,6 +52,7 @@ gem 'jwt'
group :development, :test do
gem 'brakeman', require: false
gem 'bullet'
gem 'bundler-audit', require: false
gem 'debug', platforms: %i[mri mingw x64_mingw]
gem 'dotenv-rails'
@@ -78,3 +79,9 @@ group :development do
gem 'foreman'
gem 'rubocop-rails', require: false
end
# group :production do
# gem 'uglifier'
# end
# gem 'sassc-rails'

View File

@@ -113,6 +113,9 @@ GEM
brakeman (7.0.2)
racc
builder (3.3.0)
bullet (8.0.8)
activesupport (>= 3.0.0)
uniform_notifier (~> 1.11)
bundler-audit (0.9.2)
bundler (>= 1.2.0, < 3)
thor (~> 1.0)
@@ -486,6 +489,7 @@ GEM
unicode-display_width (3.1.4)
unicode-emoji (~> 4.0, >= 4.0.4)
unicode-emoji (4.0.4)
uniform_notifier (1.17.0)
uri (1.0.3)
useragent (0.16.11)
warden (1.2.9)
@@ -519,6 +523,7 @@ DEPENDENCIES
aws-sdk-s3 (~> 1.177.0)
bootsnap
brakeman
bullet
bundler-audit
capybara
chartkick

File diff suppressed because one or more lines are too long

View File

@@ -50,16 +50,43 @@ module Distanceable
return 0 if points.length < 2
total_meters = points.each_cons(2).sum do |point1, point2|
connection.select_value(
'SELECT ST_Distance(ST_GeomFromEWKT($1)::geography, ST_GeomFromEWKT($2)::geography)',
nil,
[point1.lonlat, point2.lonlat]
)
end
# OPTIMIZED: Single SQL query instead of N individual queries
total_meters = calculate_batch_distances(points).sum
total_meters.to_f / ::DISTANCE_UNITS[unit.to_sym]
end
# Optimized batch distance calculation using single SQL query
def calculate_batch_distances(points)
return [] if points.length < 2
point_pairs = points.each_cons(2).to_a
return [] if point_pairs.empty?
# Create a VALUES clause with all point pairs
values_clause = point_pairs.map.with_index do |(p1, p2), index|
"(#{index}, ST_GeomFromEWKT('#{p1.lonlat}')::geography, ST_GeomFromEWKT('#{p2.lonlat}')::geography)"
end.join(', ')
# Single query to calculate all distances
results = connection.execute(<<-SQL.squish)
WITH point_pairs AS (
SELECT
pair_id,
point1,
point2
FROM (VALUES #{values_clause}) AS t(pair_id, point1, point2)
)
SELECT
pair_id,
ST_Distance(point1, point2) as distance_meters
FROM point_pairs
ORDER BY pair_id
SQL
# Return array of distances in meters
results.map { |row| row['distance_meters'].to_f }
end
end
def distance_to(other_point, unit = :km)

View File

@@ -25,6 +25,112 @@ class Track < ApplicationRecord
.first
end
# Optimized SQL segmentation using PostgreSQL window functions
def self.segment_points_in_sql(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters, untracked_only: false)
time_threshold_seconds = time_threshold_minutes * 60
where_clause = if untracked_only
"WHERE user_id = $1 AND timestamp BETWEEN $2 AND $3 AND track_id IS NULL"
else
"WHERE user_id = $1 AND timestamp BETWEEN $2 AND $3"
end
sql = <<~SQL
WITH points_with_gaps AS (
SELECT
id,
timestamp,
lonlat,
LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat,
LAG(timestamp) OVER (ORDER BY timestamp) as prev_timestamp,
ST_Distance(
lonlat::geography,
LAG(lonlat) OVER (ORDER BY timestamp)::geography
) as distance_meters,
(timestamp - LAG(timestamp) OVER (ORDER BY timestamp)) as time_diff_seconds
FROM points
#{where_clause}
ORDER BY timestamp
),
segment_breaks AS (
SELECT *,
CASE
WHEN prev_lonlat IS NULL THEN 1
WHEN time_diff_seconds > $4 THEN 1
WHEN distance_meters > $5 THEN 1
ELSE 0
END as is_break
FROM points_with_gaps
),
segments AS (
SELECT *,
SUM(is_break) OVER (ORDER BY timestamp ROWS UNBOUNDED PRECEDING) as segment_id
FROM segment_breaks
)
SELECT
segment_id,
array_agg(id ORDER BY timestamp) as point_ids,
count(*) as point_count,
min(timestamp) as start_timestamp,
max(timestamp) as end_timestamp,
sum(COALESCE(distance_meters, 0)) as total_distance_meters
FROM segments
GROUP BY segment_id
HAVING count(*) >= 2
ORDER BY segment_id
SQL
results = Point.connection.exec_query(
sql,
'segment_points_in_sql',
[user_id, start_timestamp, end_timestamp, time_threshold_seconds, distance_threshold_meters]
)
# Convert results to segment data
segments_data = []
results.each do |row|
segments_data << {
segment_id: row['segment_id'].to_i,
point_ids: parse_postgres_array(row['point_ids']),
point_count: row['point_count'].to_i,
start_timestamp: row['start_timestamp'].to_i,
end_timestamp: row['end_timestamp'].to_i,
total_distance_meters: row['total_distance_meters'].to_f
}
end
segments_data
end
# Get actual Point objects for each segment with pre-calculated distances
def self.get_segments_with_points(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters, untracked_only: false)
segments_data = segment_points_in_sql(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters, untracked_only: untracked_only)
# Get all point IDs we need
all_point_ids = segments_data.flat_map { |seg| seg[:point_ids] }
# Single query to get all points
points_by_id = Point.where(id: all_point_ids).index_by(&:id)
# Build segments with actual Point objects
segments_data.map do |seg_data|
{
points: seg_data[:point_ids].map { |id| points_by_id[id] }.compact,
pre_calculated_distance: seg_data[:total_distance_meters],
start_timestamp: seg_data[:start_timestamp],
end_timestamp: seg_data[:end_timestamp]
}
end
end
# Parse PostgreSQL array format like "{1,2,3}" into Ruby array
def self.parse_postgres_array(pg_array_string)
return [] if pg_array_string.nil? || pg_array_string.empty?
# Remove curly braces and split by comma
pg_array_string.gsub(/[{}]/, '').split(',').map(&:to_i)
end
private
def broadcast_track_created

View File

@@ -40,21 +40,32 @@ class Tracks::Generator
def call
clean_existing_tracks if should_clean_tracks?
points = load_points
Rails.logger.debug "Generator: loaded #{points.size} points for user #{user.id} in #{mode} mode"
return 0 if points.empty?
# Get timestamp range for SQL query
start_timestamp, end_timestamp = get_timestamp_range
Rails.logger.debug "Generator: querying points for user #{user.id} in #{mode} mode"
# Use optimized SQL segmentation with pre-calculated distances
untracked_only = (mode == :incremental)
segments = Track.get_segments_with_points(
user.id,
start_timestamp,
end_timestamp,
time_threshold_minutes,
distance_threshold_meters,
untracked_only: untracked_only
)
segments = split_points_into_segments(points)
Rails.logger.debug "Generator: created #{segments.size} segments"
Rails.logger.debug "Generator: created #{segments.size} segments via SQL"
tracks_created = 0
segments.each do |segment|
track = create_track_from_segment(segment)
segments.each do |segment_data|
track = create_track_from_segment_optimized(segment_data)
tracks_created += 1 if track
end
Rails.logger.info "Generated #{tracks_created} tracks for user #{user.id} in #{mode} mode"
Rails.logger.info "Generated #{tracks_created} tracks for user #{user.id} in optimized #{mode} mode"
tracks_created
end
@@ -99,6 +110,18 @@ class Tracks::Generator
user.tracked_points.where(timestamp: day_range).order(:timestamp)
end
def create_track_from_segment_optimized(segment_data)
points = segment_data[:points]
pre_calculated_distance = segment_data[:pre_calculated_distance]
Rails.logger.debug "Generator: processing segment with #{points.size} points"
return unless points.size >= 2
track = create_track_from_points_optimized(points, pre_calculated_distance)
Rails.logger.debug "Generator: created track #{track&.id}"
track
end
def create_track_from_segment(segment)
Rails.logger.debug "Generator: processing segment with #{segment.size} points"
return unless segment.size >= 2
@@ -171,6 +194,31 @@ class Tracks::Generator
scope.destroy_all
end
# Get timestamp range for SQL query based on mode
def get_timestamp_range
case mode
when :bulk
if start_at && end_at
[start_at.to_i, end_at.to_i]
else
# Get full range for user
first_point = user.tracked_points.order(:timestamp).first
last_point = user.tracked_points.order(:timestamp).last
[first_point&.timestamp || 0, last_point&.timestamp || Time.current.to_i]
end
when :daily
day = start_at&.to_date || Date.current
[day.beginning_of_day.to_i, day.end_of_day.to_i]
when :incremental
# For incremental, we need all untracked points up to end_at
first_point = user.tracked_points.where(track_id: nil).order(:timestamp).first
end_timestamp = end_at ? end_at.to_i : Time.current.to_i
[first_point&.timestamp || 0, end_timestamp]
else
raise ArgumentError, "Unknown mode: #{mode}"
end
end
# Threshold methods from safe_settings
def distance_threshold_meters
@distance_threshold_meters ||= user.safe_settings.meters_between_routes.to_i

View File

@@ -86,11 +86,15 @@ module Tracks::Segmentation
end
def calculate_km_distance_between_points(point1, point2)
lat1, lon1 = point_coordinates(point1)
lat2, lon2 = point_coordinates(point2)
# Use Geocoder to match behavior with frontend (same library used elsewhere in app)
Geocoder::Calculations.distance_between([lat1, lon1], [lat2, lon2], units: :km)
# OPTIMIZED: Use PostGIS for more accurate distance calculation (same as track distance)
# This maintains consistency with track distance calculations
distance_meters = Point.connection.select_value(
'SELECT ST_Distance(ST_GeomFromEWKT($1)::geography, ST_GeomFromEWKT($2)::geography)',
nil,
[point1.lonlat, point2.lonlat]
)
distance_meters.to_f / 1000.0 # Convert meters to kilometers
end
def should_finalize_segment?(segment_points, grace_period_minutes = 5)

View File

@@ -82,6 +82,38 @@ module Tracks::TrackBuilder
end
end
# Optimized version that uses pre-calculated distance from SQL
def create_track_from_points_optimized(points, pre_calculated_distance)
return nil if points.size < 2
track = Track.new(
user_id: user.id,
start_at: Time.zone.at(points.first.timestamp),
end_at: Time.zone.at(points.last.timestamp),
original_path: build_path(points)
)
# Use pre-calculated distance from SQL instead of recalculating
track.distance = pre_calculated_distance.round
track.duration = calculate_duration(points)
track.avg_speed = calculate_average_speed(track.distance, track.duration)
# Calculate elevation statistics (no DB queries needed)
elevation_stats = calculate_elevation_stats(points)
track.elevation_gain = elevation_stats[:gain]
track.elevation_loss = elevation_stats[:loss]
track.elevation_max = elevation_stats[:max]
track.elevation_min = elevation_stats[:min]
if track.save
Point.where(id: points.map(&:id)).update_all(track_id: track.id)
track
else
Rails.logger.error "Failed to create track for user #{user.id}: #{track.errors.full_messages.join(', ')}"
nil
end
end
def build_path(points)
Tracks::BuildPath.new(points).call
end

View File

@@ -21,7 +21,9 @@
<% end %>
</div>
<%= link_to 'Update stats', update_all_stats_path, data: { turbo_method: :put }, class: 'btn btn-primary mt-5' %>
<% if current_user.active? %>
<%= link_to 'Update stats', update_all_stats_path, data: { turbo_method: :put }, class: 'btn btn-primary mt-5' %>
<% end %>
<div class="mt-6 grid grid-cols-1 sm:grid-cols-1 md:grid-cols-2 lg:grid-cols-2 gap-6">
<% @stats.each do |year, stats| %>
@@ -33,7 +35,7 @@
<%= link_to '[Map]', map_url(year_timespan(year)), class: 'underline hover:no-underline' %>
</div>
<div class="gap-2">
<span class='text-xs text-gray-500'>Last updated: <%= human_date(stats.first.updated_at) %></span>
<span class='text-xs text-gray-500'>Last update: <%= human_date(stats.first.updated_at) %></span>
<%= link_to '🔄', update_year_month_stats_path(year, :all), data: { turbo_method: :put }, class: 'text-sm text-gray-500 hover:underline' %>
</div>
</h2>

View File

@@ -3,6 +3,17 @@
require 'active_support/core_ext/integer/time'
Rails.application.configure do
unless ENV['SELF_HOSTED'] == 'true'
config.after_initialize do
Bullet.enable = true
Bullet.alert = true
Bullet.bullet_logger = true
Bullet.console = true
Bullet.rails_logger = true
Bullet.add_footer = true
end
end
# Settings specified here will take precedence over those in config/application.rb.
# In the development environment your application's code is reloaded any time

View File

@@ -8,6 +8,12 @@ require 'active_support/core_ext/integer/time'
# and recreated between test runs. Don't rely on the data there!
Rails.application.configure do
config.after_initialize do
Bullet.enable = true
Bullet.bullet_logger = true
Bullet.raise = true # raise an error if n+1 query occurs
end
# Settings specified here will take precedence over those in config/application.rb.
# While tests run files are not watched, reloading is not necessary.

lib/optimized_tracks_v1.rb · Normal file · 145 lines
View File

@@ -0,0 +1,145 @@
# frozen_string_literal: true
# Optimization V1: LAG-based distance calculation with Ruby segmentation
# This keeps the existing Ruby segmentation logic but uses PostgreSQL LAG
# for batch distance calculations instead of individual queries
module OptimizedTracksV1
extend ActiveSupport::Concern
module ClassMethods
# V1: Use LAG to get all consecutive distances in a single query
def calculate_all_consecutive_distances(points)
return [] if points.length < 2
point_ids = points.map(&:id).join(',')
results = connection.execute(<<-SQL.squish)
WITH points_with_previous AS (
SELECT
id,
timestamp,
lonlat,
LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat,
LAG(timestamp) OVER (ORDER BY timestamp) as prev_timestamp,
LAG(id) OVER (ORDER BY timestamp) as prev_id
FROM points
WHERE id IN (#{point_ids})
)
SELECT
id,
prev_id,
timestamp,
prev_timestamp,
ST_Distance(lonlat::geography, prev_lonlat::geography) as distance_meters,
(timestamp - prev_timestamp) as time_diff_seconds
FROM points_with_previous
WHERE prev_lonlat IS NOT NULL
ORDER BY timestamp
SQL
# Return hash mapping point_id => {distance_to_previous, time_diff}
distance_map = {}
results.each do |row|
distance_map[row['id'].to_i] = {
distance_meters: row['distance_meters'].to_f,
time_diff_seconds: row['time_diff_seconds'].to_i,
prev_id: row['prev_id'].to_i
}
end
distance_map
end
# V1: Optimized total distance using LAG (already exists in distanceable.rb)
def total_distance_lag(points, unit = :m)
unless ::DISTANCE_UNITS.key?(unit.to_sym)
raise ArgumentError, "Invalid unit. Supported units are: #{::DISTANCE_UNITS.keys.join(', ')}"
end
return 0 if points.length < 2
point_ids = points.map(&:id).join(',')
distance_in_meters = connection.select_value(<<-SQL.squish)
WITH points_with_previous AS (
SELECT
lonlat,
LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat
FROM points
WHERE id IN (#{point_ids})
)
SELECT COALESCE(
SUM(ST_Distance(lonlat::geography, prev_lonlat::geography)),
0
)
FROM points_with_previous
WHERE prev_lonlat IS NOT NULL
SQL
distance_in_meters.to_f / ::DISTANCE_UNITS[unit.to_sym]
end
end
end
# Optimized segmentation module using pre-calculated distances
module OptimizedSegmentationV1
extend ActiveSupport::Concern
private
def split_points_into_segments_v1(points)
return [] if points.empty?
# V1: Pre-calculate all distances and time diffs in one query
if points.size > 1
distance_data = Point.calculate_all_consecutive_distances(points)
else
distance_data = {}
end
segments = []
current_segment = []
points.each do |point|
if current_segment.empty?
# First point always starts a segment
current_segment = [point]
elsif should_start_new_segment_v1?(point, current_segment.last, distance_data)
# Finalize current segment if it has enough points
segments << current_segment if current_segment.size >= 2
current_segment = [point]
else
current_segment << point
end
end
# Don't forget the last segment
segments << current_segment if current_segment.size >= 2
segments
end
def should_start_new_segment_v1?(current_point, previous_point, distance_data)
return false if previous_point.nil?
# Get pre-calculated data for this point
point_data = distance_data[current_point.id]
return false unless point_data
# Check time threshold
time_threshold_seconds = time_threshold_minutes.to_i * 60
return true if point_data[:time_diff_seconds] > time_threshold_seconds
# Check distance threshold
distance_meters = point_data[:distance_meters]
return true if distance_meters > distance_threshold_meters
false
end
end
# Add methods to Point class
class Point
extend OptimizedTracksV1::ClassMethods
end

lib/optimized_tracks_v2.rb · Normal file · 291 lines
View File

@@ -0,0 +1,291 @@
# frozen_string_literal: true
# Optimization V2: Full SQL segmentation using PostgreSQL window functions
# This does both distance calculation AND segmentation entirely in SQL
module OptimizedTracksV2
extend ActiveSupport::Concern
module ClassMethods
# V2: Complete segmentation in SQL using LAG and window functions
def segment_points_in_sql(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters)
time_threshold_seconds = time_threshold_minutes * 60
sql = <<~SQL
WITH points_with_gaps AS (
SELECT
id,
timestamp,
lonlat,
LAG(lonlat) OVER (ORDER BY timestamp) as prev_lonlat,
LAG(timestamp) OVER (ORDER BY timestamp) as prev_timestamp,
ST_Distance(
lonlat::geography,
LAG(lonlat) OVER (ORDER BY timestamp)::geography
) as distance_meters,
(timestamp - LAG(timestamp) OVER (ORDER BY timestamp)) as time_diff_seconds
FROM points
WHERE user_id = $1
AND timestamp BETWEEN $2 AND $3
ORDER BY timestamp
),
segment_breaks AS (
SELECT *,
CASE
WHEN prev_lonlat IS NULL THEN 1
WHEN time_diff_seconds > $4 THEN 1
WHEN distance_meters > $5 THEN 1
ELSE 0
END as is_break
FROM points_with_gaps
),
segments AS (
SELECT *,
SUM(is_break) OVER (ORDER BY timestamp ROWS UNBOUNDED PRECEDING) as segment_id
FROM segment_breaks
)
SELECT
segment_id,
array_agg(id ORDER BY timestamp) as point_ids,
count(*) as point_count,
min(timestamp) as start_timestamp,
max(timestamp) as end_timestamp,
sum(COALESCE(distance_meters, 0)) as total_distance_meters
FROM segments
GROUP BY segment_id
HAVING count(*) >= 2
ORDER BY segment_id
SQL
results = connection.exec_query(
sql,
'segment_points_in_sql',
[user_id, start_timestamp, end_timestamp, time_threshold_seconds, distance_threshold_meters]
)
# Convert results to segment data
segments_data = []
results.each do |row|
segments_data << {
segment_id: row['segment_id'].to_i,
point_ids: parse_postgres_array(row['point_ids']),
point_count: row['point_count'].to_i,
start_timestamp: row['start_timestamp'].to_i,
end_timestamp: row['end_timestamp'].to_i,
total_distance_meters: row['total_distance_meters'].to_f
}
end
segments_data
end
# V2: Get actual Point objects for each segment
def get_segments_with_points(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters)
segments_data = segment_points_in_sql(user_id, start_timestamp, end_timestamp, time_threshold_minutes, distance_threshold_meters)
# Get all point IDs we need
all_point_ids = segments_data.flat_map { |seg| seg[:point_ids] }
# Single query to get all points
points_by_id = Point.where(id: all_point_ids).index_by(&:id)
# Build segments with actual Point objects
segments_data.map do |seg_data|
{
points: seg_data[:point_ids].map { |id| points_by_id[id] }.compact,
pre_calculated_distance: seg_data[:total_distance_meters],
start_timestamp: seg_data[:start_timestamp],
end_timestamp: seg_data[:end_timestamp]
}
end
end
private
# Parse PostgreSQL array format like "{1,2,3}" into Ruby array
def parse_postgres_array(pg_array_string)
return [] if pg_array_string.nil? || pg_array_string.empty?
# Remove curly braces and split by comma
pg_array_string.gsub(/[{}]/, '').split(',').map(&:to_i)
end
end
end
# Optimized generator using V2 SQL segmentation
class OptimizedTracksGeneratorV2
attr_reader :user, :start_at, :end_at, :mode
def initialize(user, start_at: nil, end_at: nil, mode: :bulk)
@user = user
@start_at = start_at
@end_at = end_at
@mode = mode.to_sym
end
def call
clean_existing_tracks if should_clean_tracks?
# Get timestamp range for SQL query
start_timestamp, end_timestamp = get_timestamp_range
Rails.logger.debug "OptimizedGeneratorV2: querying points for user #{user.id} in #{mode} mode"
# V2: Get segments directly from SQL with pre-calculated distances
segments = Point.get_segments_with_points(
user.id,
start_timestamp,
end_timestamp,
time_threshold_minutes,
distance_threshold_meters
)
Rails.logger.debug "OptimizedGeneratorV2: created #{segments.size} segments via SQL"
tracks_created = 0
segments.each do |segment_data|
track = create_track_from_segment_v2(segment_data)
tracks_created += 1 if track
end
Rails.logger.info "Generated #{tracks_created} tracks for user #{user.id} in optimized V2 #{mode} mode"
tracks_created
end
private
def create_track_from_segment_v2(segment_data)
points = segment_data[:points]
pre_calculated_distance = segment_data[:pre_calculated_distance]
Rails.logger.debug "OptimizedGeneratorV2: processing segment with #{points.size} points"
return unless points.size >= 2
track = Track.new(
user_id: user.id,
start_at: Time.zone.at(points.first.timestamp),
end_at: Time.zone.at(points.last.timestamp),
original_path: build_path(points)
)
# V2: Use pre-calculated distance from SQL
track.distance = pre_calculated_distance.round
track.duration = calculate_duration(points)
track.avg_speed = calculate_average_speed(track.distance, track.duration)
# Calculate elevation statistics (no DB queries needed)
elevation_stats = calculate_elevation_stats(points)
track.elevation_gain = elevation_stats[:gain]
track.elevation_loss = elevation_stats[:loss]
track.elevation_max = elevation_stats[:max]
track.elevation_min = elevation_stats[:min]
if track.save
Point.where(id: points.map(&:id)).update_all(track_id: track.id)
track
else
Rails.logger.error "Failed to create track for user #{user.id}: #{track.errors.full_messages.join(', ')}"
nil
end
end
def get_timestamp_range
case mode
when :bulk
if start_at && end_at
[start_at.to_i, end_at.to_i]
else
# Get full range for user
first_point = user.tracked_points.order(:timestamp).first
last_point = user.tracked_points.order(:timestamp).last
[first_point&.timestamp || 0, last_point&.timestamp || Time.current.to_i]
end
when :daily
day = start_at&.to_date || Date.current
[day.beginning_of_day.to_i, day.end_of_day.to_i]
when :incremental
# For incremental, we need all untracked points up to end_at
first_point = user.tracked_points.where(track_id: nil).order(:timestamp).first
end_timestamp = end_at ? end_at.to_i : Time.current.to_i
[first_point&.timestamp || 0, end_timestamp]
end
end
def should_clean_tracks?
case mode
when :bulk, :daily then true
else false
end
end
def clean_existing_tracks
case mode
when :bulk
scope = user.tracks
if start_at && end_at
scope = scope.where(start_at: start_at..end_at)
end
scope.destroy_all
when :daily
day = start_at&.to_date || Date.current
range = day.beginning_of_day..day.end_of_day
user.tracks.where(start_at: range).destroy_all
end
end
# Helper methods (same as original)
def build_path(points)
Tracks::BuildPath.new(points).call
end
def calculate_duration(points)
points.last.timestamp - points.first.timestamp
end
def calculate_average_speed(distance_in_meters, duration_seconds)
return 0.0 if duration_seconds <= 0 || distance_in_meters <= 0
speed_mps = distance_in_meters.to_f / duration_seconds
(speed_mps * 3.6).round(2) # m/s to km/h
end
def calculate_elevation_stats(points)
altitudes = points.map(&:altitude).compact
return { gain: 0, loss: 0, max: 0, min: 0 } if altitudes.empty?
elevation_gain = 0
elevation_loss = 0
previous_altitude = altitudes.first
altitudes[1..].each do |altitude|
diff = altitude - previous_altitude
if diff > 0
elevation_gain += diff
else
elevation_loss += diff.abs
end
previous_altitude = altitude
end
{
gain: elevation_gain.round,
loss: elevation_loss.round,
max: altitudes.max,
min: altitudes.min
}
end
def distance_threshold_meters
@distance_threshold_meters ||= user.safe_settings.meters_between_routes.to_i
end
def time_threshold_minutes
@time_threshold_minutes ||= user.safe_settings.minutes_between_routes.to_i
end
end
# Add methods to Point class
class Point
extend OptimizedTracksV2::ClassMethods
end

lib/results.md · Normal file · 122 lines
View File

@@ -0,0 +1,122 @@
## Original
Generator: created track 227296
Generated 1437 tracks for user 1 in bulk mode
✅ Generation completed successfully
============================================================
📊 BENCHMARK RESULTS
============================================================
Status: ✅ SUCCESS
Execution Time: 1m 28.5s
Tracks Created: 1437
Timeframe Coverage: 8.0% of user's total data
💾 Memory Usage:
Start: 210.9MB
End: 433.2MB
Memory Increase: +222.3MB
🗄️ Database Performance:
Total Queries: 115920
Total Query Time: 50453.1ms
Average Query Time: 0.44ms
Slow Queries (>100ms): 63
1. 983.24ms - SELECT COUNT(*) FROM "points" WHERE "points"."user_id" = $1 AND "points"."timestamp" BETWEEN $2 A...
2. 2826.02ms - SELECT "points".* FROM "points" WHERE "points"."user_id" = $1 AND "points"."timestamp" BETWEEN $2...
3. 217.02ms - UPDATE "points" SET "track_id" = $1 WHERE "points"."id" IN ($2, $3, $4, $5, $6, $7, $8, $9, $10, ...
✔️ Post-Generation Validation:
Points in Timeframe: 111609
Points with Tracks: 110167
Points without Tracks: 1442
Track Records: 1437
✅ Data integrity: PASS
🔍 Performance Analysis:
Speed Rating: 🚀 Excellent (1m 28.5s)
Memory Rating: 🧡 High (433.2MB peak)
Recommendation: Consider database optimization or smaller batch sizes
🔮 Extrapolation for Full Dataset:
Full Dataset Size: 1,403,662 points
Scaling Factor: 12.6x
Estimated Full Time: 18m 32.8s
Estimated Full Memory: 5447.6MB
============================================================
📋 BENCHMARK SUMMARY
============================================================
⏱️ Total Time: 1m 28.5s
📍 Points Processed: 111,609
🛤️ Tracks Created: 1437
🚀 Processing Speed: 1261.4 points/second
📅 Timeframe: 2024-01-01 to 2024-12-31
👤 User: demo@dawarich.app (ID: 1)
✅ Status: COMPLETED
## Iteration 1
Generator: created track 244784
Generated 1435 tracks for user 1 in optimized bulk mode
✅ Generation completed successfully
============================================================
📊 BENCHMARK RESULTS
============================================================
Status: ✅ SUCCESS
Execution Time: 56.4s
Tracks Created: 1435
Points Processed: 111,609
Processing Speed: 1978.3 points/second
Average Points/Track: 77.8
Timeframe Coverage: 8.0% of user's total data
💾 Memory Usage:
Start: 297.2MB
End: 407.5MB
Memory Increase: +110.3MB
🗄️ Database Performance:
Total Queries: 7178
Total Query Time: 44521.33ms
Average Query Time: 6.2ms
Slow Queries (>100ms): 88
1. 2338.43ms - WITH points_with_gaps AS (
SELECT
id,
timestamp,
lonlat,
LAG(lonlat) OVER (ORDE...
2. 4156.84ms - SELECT "points".* FROM "points" WHERE "points"."id" IN (2163775, 2163776, 2163777, 2163778, 21637...
3. 298.62ms - UPDATE "points" SET "track_id" = $1 WHERE "points"."id" IN ($2, $3, $4, $5, $6, $7, $8, $9, $10, ...
✔️ Post-Generation Validation:
Points in Timeframe: 111609
Points with Tracks: 110123
Points without Tracks: 1486
Track Records: 1435
✅ Data integrity: PASS
🔍 Performance Analysis:
Speed Rating: 🚀 Excellent (56.4s)
Memory Rating: 🧡 High (407.5MB peak)
Recommendation: Consider database optimization or smaller batch sizes
🔮 Extrapolation for Full Dataset:
Full Dataset Size: 1,403,662 points
Scaling Factor: 12.6x
Estimated Full Time: 11m 49.5s
Estimated Full Memory: 5125.0MB
============================================================
📋 BENCHMARK SUMMARY
============================================================
⏱️ Total Time: 56.4s
📍 Points Processed: 111,609
🛤️ Tracks Created: 1435
🚀 Processing Speed: 1978.3 points/second
📅 Timeframe: 2024-01-01 to 2024-12-31
👤 User: demo@dawarich.app (ID: 1)
✅ Status: COMPLETED

View File

@@ -1,7 +1,6 @@
# frozen_string_literal: true
module Timestamps
def self.parse_timestamp(timestamp)
begin
# if the timestamp is in ISO 8601 format, try to parse it

View File

@@ -0,0 +1,625 @@
# frozen_string_literal: true
require_relative 'optimized_tracks_v1'
require_relative 'optimized_tracks_v2'
# Benchmark script to compare three different track generation approaches:
# - Original: Individual distance queries (current implementation)
# - V1: LAG-based distance pre-calculation with Ruby segmentation
# - V2: Full SQL segmentation with PostgreSQL window functions
#
# Usage:
# rails runner lib/tracks_optimization_benchmark.rb USER_ID START_DATE END_DATE
class TracksOptimizationBenchmark
attr_reader :user, :start_date, :end_date, :start_timestamp, :end_timestamp
def initialize(user_id, start_date, end_date)
@user = User.find(user_id)
@start_date = Date.parse(start_date)
@end_date = Date.parse(end_date)
@start_timestamp = @start_date.beginning_of_day.to_i
@end_timestamp = @end_date.end_of_day.to_i
puts "🔬 Track Generation Optimization Benchmark"
puts "👤 User: #{user.email} (ID: #{user.id})"
puts "📅 Timeframe: #{start_date} to #{end_date}"
check_data_availability
end
def run_all_benchmarks
results = {}
puts "\n" + "=" * 80
puts "🏃 RUNNING ALL BENCHMARKS"
puts "=" * 80
# Test Original approach
puts "\n1⃣ Testing ORIGINAL approach..."
results[:original] = benchmark_original
# Test V1 approach
puts "\n2⃣ Testing V1 (LAG + Ruby) approach..."
results[:v1] = benchmark_v1
# Test V2 approach
puts "\n3⃣ Testing V2 (Full SQL) approach..."
results[:v2] = benchmark_v2
# Compare results
puts "\n" + "=" * 80
puts "📊 PERFORMANCE COMPARISON"
puts "=" * 80
compare_results(results)
# Save results to files
save_results_to_files(results)
results
end
private
def check_data_availability
point_count = user.tracked_points.where(timestamp: start_timestamp..end_timestamp).count
existing_tracks = user.tracks.where(start_at: Time.zone.at(start_timestamp)..Time.zone.at(end_timestamp)).count
puts "📊 Dataset: #{point_count.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} points"
puts "🛤️ Existing tracks: #{existing_tracks}"
if point_count == 0
puts "❌ No points found in timeframe"
exit 1
end
if point_count > 50000
puts "⚠️ Large dataset detected. This benchmark may take a while..."
end
end
def benchmark_original
puts " Using standard Tracks::Generator..."
# Clean existing tracks
cleanup_tracks
# Monitor performance
memory_start = get_memory_mb
query_monitor = QueryMonitor.new
query_monitor.start
start_time = Time.current
begin
generator = Tracks::Generator.new(
user,
start_at: Time.zone.at(start_timestamp),
end_at: Time.zone.at(end_timestamp),
mode: :bulk
)
tracks_created = generator.call
success = true
rescue => e
success = false
error = e.message
tracks_created = 0
end
end_time = Time.current
memory_end = get_memory_mb
query_monitor.stop
execution_time = end_time - start_time
result = {
approach: "Original",
success: success,
error: error,
execution_time: execution_time,
tracks_created: tracks_created,
memory_increase: memory_end - memory_start,
query_count: query_monitor.query_count,
query_time_ms: query_monitor.total_time_ms
}
print_result(result)
result
end
def benchmark_v1
puts " Using V1: LAG + Ruby segmentation..."
# Clean existing tracks
cleanup_tracks
# For V1, we need to modify the existing generator to use our optimized methods
# This is a simplified test - in practice we'd modify the actual generator
memory_start = get_memory_mb
query_monitor = QueryMonitor.new
query_monitor.start
start_time = Time.current
begin
# Load points
points = user.tracked_points
.where(timestamp: start_timestamp..end_timestamp)
.order(:timestamp)
# V1: Use optimized segmentation with pre-calculated distances
if points.size > 1
distance_data = Point.calculate_all_consecutive_distances(points)
else
distance_data = {}
end
# Segment using V1 approach (simplified for benchmark)
segments = split_points_with_precalculated_distances(points, distance_data)
tracks_created = 0
segments.each do |segment|
if segment.size >= 2
track = create_track_v1(segment)
tracks_created += 1 if track
end
end
success = true
rescue => e
success = false
error = e.message
tracks_created = 0
end
end_time = Time.current
memory_end = get_memory_mb
query_monitor.stop
execution_time = end_time - start_time
result = {
approach: "V1 (LAG + Ruby)",
success: success,
error: error,
execution_time: execution_time,
tracks_created: tracks_created,
memory_increase: memory_end - memory_start,
query_count: query_monitor.query_count,
query_time_ms: query_monitor.total_time_ms
}
print_result(result)
result
end
def benchmark_v2
puts " Using V2: Full SQL segmentation..."
cleanup_tracks
memory_start = get_memory_mb
query_monitor = QueryMonitor.new
query_monitor.start
start_time = Time.current
begin
generator = OptimizedTracksGeneratorV2.new(
user,
start_at: Time.zone.at(start_timestamp),
end_at: Time.zone.at(end_timestamp),
mode: :bulk
)
tracks_created = generator.call
success = true
rescue => e
success = false
error = e.message
tracks_created = 0
end
end_time = Time.current
memory_end = get_memory_mb
query_monitor.stop
execution_time = end_time - start_time
result = {
approach: "V2 (Full SQL)",
success: success,
error: error,
execution_time: execution_time,
tracks_created: tracks_created,
memory_increase: memory_end - memory_start,
query_count: query_monitor.query_count,
query_time_ms: query_monitor.total_time_ms
}
print_result(result)
result
end
def split_points_with_precalculated_distances(points, distance_data)
return [] if points.empty?
segments = []
current_segment = []
points.each do |point|
if current_segment.empty?
current_segment = [point]
elsif should_break_segment_v1?(point, current_segment.last, distance_data)
segments << current_segment if current_segment.size >= 2
current_segment = [point]
else
current_segment << point
end
end
segments << current_segment if current_segment.size >= 2
segments
end
def should_break_segment_v1?(current_point, previous_point, distance_data)
return false if previous_point.nil?
point_data = distance_data[current_point.id]
return false unless point_data
time_threshold_seconds = user.safe_settings.minutes_between_routes.to_i * 60
distance_threshold_meters = user.safe_settings.meters_between_routes.to_i
return true if point_data[:time_diff_seconds] > time_threshold_seconds
return true if point_data[:distance_meters] > distance_threshold_meters
false
end
def create_track_v1(points)
return nil if points.size < 2
track = Track.new(
user_id: user.id,
start_at: Time.zone.at(points.first.timestamp),
end_at: Time.zone.at(points.last.timestamp),
original_path: build_path(points)
)
# Use LAG-based distance calculation
track.distance = Point.total_distance_lag(points, :m).round
track.duration = points.last.timestamp - points.first.timestamp
track.avg_speed = calculate_average_speed(track.distance, track.duration)
# Elevation stats (same as original)
elevation_stats = calculate_elevation_stats(points)
track.elevation_gain = elevation_stats[:gain]
track.elevation_loss = elevation_stats[:loss]
track.elevation_max = elevation_stats[:max]
track.elevation_min = elevation_stats[:min]
if track.save
Point.where(id: points.map(&:id)).update_all(track_id: track.id)
track
else
nil
end
end
def cleanup_tracks
user.tracks.where(start_at: Time.zone.at(start_timestamp)..Time.zone.at(end_timestamp)).destroy_all
end
def print_result(result)
status = result[:success] ? "✅ SUCCESS" : "❌ FAILED"
puts " #{status}"
puts " ⏱️ Time: #{format_duration(result[:execution_time])}"
puts " 🛤️ Tracks: #{result[:tracks_created]}"
puts " 💾 Memory: +#{result[:memory_increase].round(1)}MB"
puts " 🗄️ Queries: #{result[:query_count]} (#{result[:query_time_ms].round(1)}ms)"
puts " ❌ Error: #{result[:error]}" if result[:error]
end
def compare_results(results)
return unless results[:original] && results[:v1] && results[:v2]
puts sprintf("%-20s %-10s %-12s %-10s %-15s %-10s",
"Approach", "Time", "Tracks", "Memory", "Queries", "Query Time")
puts "-" * 80
[:original, :v1, :v2].each do |approach|
result = results[approach]
next unless result[:success]
puts sprintf("%-20s %-10s %-12s %-10s %-15s %-10s",
result[:approach],
format_duration(result[:execution_time]),
result[:tracks_created],
"+#{result[:memory_increase].round(1)}MB",
result[:query_count],
"#{result[:query_time_ms].round(1)}ms")
end
# Calculate improvements
if results[:original][:success]
original_time = results[:original][:execution_time]
original_queries = results[:original][:query_count]
puts "\n🚀 Performance Improvements vs Original:"
if results[:v1][:success]
v1_speedup = (original_time / results[:v1][:execution_time]).round(2)
v1_query_reduction = ((original_queries - results[:v1][:query_count]) / original_queries.to_f * 100).round(1)
puts " V1: #{v1_speedup}x faster, #{v1_query_reduction}% fewer queries"
end
if results[:v2][:success]
v2_speedup = (original_time / results[:v2][:execution_time]).round(2)
v2_query_reduction = ((original_queries - results[:v2][:query_count]) / original_queries.to_f * 100).round(1)
puts " V2: #{v2_speedup}x faster, #{v2_query_reduction}% fewer queries"
end
end
end
def save_results_to_files(results)
timestamp = Time.current.strftime('%Y%m%d_%H%M%S')
point_count = user.tracked_points.where(timestamp: start_timestamp..end_timestamp).count
# Create detailed results structure
benchmark_data = {
meta: {
timestamp: Time.current.iso8601,
user_id: user.id,
user_email: user.email,
start_date: start_date.strftime('%Y-%m-%d'),
end_date: end_date.strftime('%Y-%m-%d'),
point_count: point_count,
ruby_version: RUBY_VERSION,
rails_version: Rails.version,
database_adapter: ActiveRecord::Base.connection.adapter_name
},
results: results,
performance_analysis: analyze_performance_data(results)
}
# Save JSON results for programmatic analysis
json_filename = "tracks_optimization_#{timestamp}.json"
json_path = Rails.root.join('lib', json_filename)
File.write(json_path, JSON.pretty_generate(benchmark_data))
# Save human-readable markdown report
md_filename = "tracks_optimization_#{timestamp}.md"
md_path = Rails.root.join('lib', md_filename)
File.write(md_path, generate_markdown_report(benchmark_data))
puts "\n💾 Results saved:"
puts " 📄 JSON: #{json_path}"
puts " 📝 Report: #{md_path}"
end
def analyze_performance_data(results)
return {} unless results[:original] && results[:original][:success]
original = results[:original]
analysis = {
baseline: {
execution_time: original[:execution_time],
query_count: original[:query_count],
memory_usage: original[:memory_increase]
}
}
[:v1, :v2].each do |version|
next unless results[version] && results[version][:success]
result = results[version]
analysis[version] = {
speedup_factor: (original[:execution_time] / result[:execution_time]).round(2),
query_reduction_percent: ((original[:query_count] - result[:query_count]) / original[:query_count].to_f * 100).round(1),
memory_change_percent: ((result[:memory_increase] - original[:memory_increase]) / original[:memory_increase].to_f * 100).round(1),
execution_time_saved: (original[:execution_time] - result[:execution_time]).round(2)
}
end
analysis
end
def generate_markdown_report(benchmark_data)
meta = benchmark_data[:meta]
results = benchmark_data[:results]
analysis = benchmark_data[:performance_analysis]
report = <<~MD
# Tracks Generation Optimization Benchmark Report
**Generated:** #{meta[:timestamp]}
**User:** #{meta[:user_email]} (ID: #{meta[:user_id]})
**Timeframe:** #{meta[:start_date]} to #{meta[:end_date]}
**Dataset:** #{meta[:point_count].to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse} points
**Environment:** Ruby #{meta[:ruby_version]}, Rails #{meta[:rails_version]}, #{meta[:database_adapter]}
## Summary
This benchmark compares three approaches to track generation:
- **Original:** Individual PostGIS queries for each distance calculation
- **V1 (LAG + Ruby):** PostgreSQL LAG for batch distance calculation, Ruby segmentation
- **V2 (Full SQL):** Complete segmentation using PostgreSQL window functions
## Results
| Approach | Status | Time | Tracks | Memory | Queries | Query Time |
|----------|--------|------|--------|--------|---------|------------|
MD
[:original, :v1, :v2].each do |approach|
next unless results[approach]
result = results[approach]
status = result[:success] ? "" : ""
report += "| #{result[:approach]} | #{status} | #{format_duration(result[:execution_time])} | #{result[:tracks_created]} | +#{result[:memory_increase].round(1)}MB | #{result[:query_count]} | #{result[:query_time_ms].round(1)}ms |\n"
end
if analysis[:v1] || analysis[:v2]
report += "\n## Performance Improvements\n\n"
if analysis[:v1]
v1 = analysis[:v1]
report += "### V1 (LAG + Ruby) vs Original\n"
report += "- **#{v1[:speedup_factor]}x faster** execution\n"
report += "- **#{v1[:query_reduction_percent]}% fewer** database queries\n"
report += "- **#{format_duration(v1[:execution_time_saved])} time saved**\n"
report += "- Memory change: #{v1[:memory_change_percent] > 0 ? '+' : ''}#{v1[:memory_change_percent]}%\n\n"
end
if analysis[:v2]
v2 = analysis[:v2]
report += "### V2 (Full SQL) vs Original\n"
report += "- **#{v2[:speedup_factor]}x faster** execution\n"
report += "- **#{v2[:query_reduction_percent]}% fewer** database queries\n"
report += "- **#{format_duration(v2[:execution_time_saved])} time saved**\n"
report += "- Memory change: #{v2[:memory_change_percent] > 0 ? '+' : ''}#{v2[:memory_change_percent]}%\n\n"
end
end
# Add detailed results
report += "## Detailed Results\n\n"
[:original, :v1, :v2].each do |approach|
next unless results[approach]
result = results[approach]
report += "### #{result[:approach]}\n\n"
if result[:success]
report += "- ✅ **Status:** Success\n"
report += "- ⏱️ **Execution Time:** #{format_duration(result[:execution_time])}\n"
report += "- 🛤️ **Tracks Created:** #{result[:tracks_created]}\n"
report += "- 💾 **Memory Increase:** +#{result[:memory_increase].round(1)}MB\n"
report += "- 🗄️ **Database Queries:** #{result[:query_count]}\n"
report += "- ⚡ **Query Time:** #{result[:query_time_ms].round(1)}ms\n"
if result[:query_count] > 0
avg_query_time = (result[:query_time_ms] / result[:query_count]).round(2)
report += "- 📊 **Average Query Time:** #{avg_query_time}ms\n"
end
else
report += "- ❌ **Status:** Failed\n"
report += "- 🚨 **Error:** #{result[:error]}\n"
end
report += "\n"
end
report += "## Recommendations\n\n"
if analysis[:v2] && analysis[:v2][:speedup_factor] > analysis.dig(:v1, :speedup_factor).to_f
report += "🚀 **V2 (Full SQL)** shows the best performance with #{analysis[:v2][:speedup_factor]}x speedup.\n\n"
report += "Benefits:\n"
report += "- Minimal database queries (#{results.dig(:v2, :query_count)} vs #{results.dig(:original, :query_count)})\n"
report += "- Fastest execution time\n"
report += "- Leverages PostgreSQL's optimized window functions\n\n"
elsif analysis[:v1]
report += "🏃 **V1 (LAG + Ruby)** provides good performance improvements with #{analysis[:v1][:speedup_factor]}x speedup.\n\n"
end
if results[:original] && results[:original][:query_count] > 50000
report += "⚠️ **Current implementation** makes excessive database queries (#{results[:original][:query_count]}) for this dataset size.\n\n"
end
report += "---\n*Generated by TracksOptimizationBenchmark*"
report
end
# Helper methods
def get_memory_mb
`ps -o rss= -p #{Process.pid}`.to_i / 1024.0
end
def format_duration(seconds)
if seconds < 60
"#{seconds.round(1)}s"
else
minutes = (seconds / 60).floor
remaining_seconds = (seconds % 60).round(1)
"#{minutes}m #{remaining_seconds}s"
end
end
def build_path(points)
Tracks::BuildPath.new(points).call
end
def calculate_average_speed(distance_in_meters, duration_seconds)
return 0.0 if duration_seconds <= 0 || distance_in_meters <= 0
speed_mps = distance_in_meters.to_f / duration_seconds
(speed_mps * 3.6).round(2)
end
def calculate_elevation_stats(points)
altitudes = points.map(&:altitude).compact
return { gain: 0, loss: 0, max: 0, min: 0 } if altitudes.empty?
elevation_gain = 0
elevation_loss = 0
previous_altitude = altitudes.first
altitudes[1..].each do |altitude|
diff = altitude - previous_altitude
if diff > 0
elevation_gain += diff
else
elevation_loss += diff.abs
end
previous_altitude = altitude
end
{ gain: elevation_gain.round, loss: elevation_loss.round, max: altitudes.max, min: altitudes.min }
end
end
# Simple query monitor for this benchmark
class QueryMonitor
attr_reader :query_count, :total_time_ms
def initialize
@query_count = 0
@total_time_ms = 0
end
def start
@subscription = ActiveSupport::Notifications.subscribe('sql.active_record') do |*args|
event = ActiveSupport::Notifications::Event.new(*args)
next if event.payload[:name]&.include?('SCHEMA')
@query_count += 1
@total_time_ms += event.duration
end
end
def stop
ActiveSupport::Notifications.unsubscribe(@subscription) if @subscription
end
end
# Command line interface
if __FILE__ == $0
if ARGV.length < 3
puts "Usage: rails runner #{__FILE__} USER_ID START_DATE END_DATE"
puts ""
puts "Example:"
puts " rails runner #{__FILE__} 1 2024-01-01 2024-01-31"
exit 1
end
user_id = ARGV[0].to_i
start_date = ARGV[1]
end_date = ARGV[2]
benchmark = TracksOptimizationBenchmark.new(user_id, start_date, end_date)
results = benchmark.run_all_benchmarks
puts "\n🎉 Benchmark completed! Check results above."
end

View File

@@ -0,0 +1,235 @@
# Tracks Feature Performance Optimization Options
## Current State Analysis
### Performance Characteristics
- **Time Complexity:** O(n log n) where n = number of GPS points
- **Memory Usage:** Loads entire dataset into memory (~200-400 bytes per point)
- **Processing Mode:** Single-threaded, sequential segmentation
- **Database Load:** Multiple PostGIS distance calculations per point pair
### Performance Estimates (Bulk Mode)
| Points | Processing Time | Memory Usage | Database Load |
|--------|----------------|--------------|---------------|
| 10K | 30-60 seconds | ~50 MB | Low |
| 100K | 5-15 minutes | ~200 MB | Medium |
| 1M+ | 30-90 minutes | 400+ MB | High |
### Current Bottlenecks
1. **Memory constraints** - Loading all points at once
2. **PostGIS distance calculations** - Sequential, not optimized
3. **Single-threaded processing** - No parallelization
4. **No progress indication** - Users can't track long-running operations
---
## Optimization Options
### Option 1: Enhanced Time-Based Batching
**Complexity:** Low | **Impact:** High | **Risk:** Low
#### Implementation
- Extend existing `:daily` mode with configurable batch sizes
- Add 1-point overlap between batches to maintain segmentation accuracy
- Implement batch-aware progress reporting
#### Benefits
- **Memory reduction:** 90%+ (from ~400MB to ~40MB for 1M points)
- **Better UX:** Progress indication and cancellation support
- **Incremental processing:** Can resume interrupted operations
- **Lower DB pressure:** Smaller query result sets
#### Changes Required
```ruby
# Enhanced generator with configurable batching
Tracks::Generator.new(
user,
mode: :batched,
batch_size: 24.hours,
enable_overlap: true
).call
```
#### Edge Cases to Handle
- Tracks spanning batch boundaries (solved with the overlap; sketched below)
- Midnight-crossing tracks in daily mode
- Deduplication of overlapping segments
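A minimal sketch of the batching loop under these assumptions — `process_batch` is a hypothetical hook into the existing segmentation pipeline, and the method signature is illustrative, not a final API:

```ruby
# Sketch only: time-based batching with a 1-point overlap between batches.
# `process_batch` is a hypothetical stand-in for the segmentation logic.
def generate_in_batches(user, from:, to:, batch_size: 24.hours)
  carry_over = nil # last point of the previous batch (the overlap point)

  (from.to_i...to.to_i).step(batch_size.to_i) do |batch_start|
    batch_end = [batch_start + batch_size.to_i, to.to_i].min
    points = user.tracked_points
                 .where(timestamp: batch_start...batch_end)
                 .order(:timestamp)
                 .to_a
    points.unshift(carry_over) if carry_over # gap check sees the boundary pair

    process_batch(points) if points.size >= 2
    carry_over = points.last if points.any?
  end
end
```

The carried point lets the gap check at a batch boundary see the same point pair it would in a single pass; merging or deduplicating segments that span a boundary still has to be handled separately, as noted above.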
---
### Option 2: Spatial Indexing Optimization
**Complexity:** Medium | **Impact:** Medium | **Risk:** Low
#### Implementation
- Replace individual PostGIS calls with batch distance calculations
- Implement spatial clustering for nearby points before segmentation
- Use PostGIS window functions for distance calculations
#### Benefits
- **Faster distance calculations:** Batch operations vs individual queries
- **Reduced DB round-trips:** Single query for multiple distance calculations
- **Better index utilization:** Leverage existing spatial indexes
#### Changes Required
```sql
-- Batch distance calculation approach
WITH point_distances AS (
SELECT
id,
timestamp,
ST_Distance(
lonlat::geography,
LAG(lonlat::geography) OVER (ORDER BY timestamp)
) as distance_to_previous
FROM points
WHERE user_id = ?
ORDER BY timestamp
)
SELECT * FROM point_distances WHERE distance_to_previous > ?
```
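For reference, the commit runs similar window-function SQL through `exec_query` with positional binds; a sketch of wiring the query above the same way (`user_id` and `threshold_m` are placeholder variables):

```ruby
# Sketch: issuing the batch distance query with bind parameters ($1, $2)
# instead of string interpolation, mirroring exec_query usage in this commit.
sql = <<~SQL
  WITH point_distances AS (
    SELECT
      id,
      timestamp,
      ST_Distance(
        lonlat::geography,
        LAG(lonlat::geography) OVER (ORDER BY timestamp)
      ) AS distance_to_previous
    FROM points
    WHERE user_id = $1
  )
  SELECT * FROM point_distances WHERE distance_to_previous > $2
SQL

rows = Point.connection.exec_query(sql, 'point_distances', [user_id, threshold_m])
rows.each { |row| puts row['distance_to_previous'] }
```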
---
### Option 3: Parallel Processing with Worker Pools
**Complexity:** High | **Impact:** High | **Risk:** Medium
#### Implementation
- Split large datasets into non-overlapping time ranges
- Process multiple batches in parallel using Sidekiq workers
- Implement coordination mechanism for dependent segments
#### Benefits
- **Faster processing:** Utilize multiple CPU cores
- **Scalable:** Performance scales with worker capacity
- **Background processing:** Non-blocking for users
#### Challenges
- **Complex coordination:** Managing dependencies between batches
- **Resource competition:** Multiple workers accessing same user's data
- **Error handling:** Partial failure scenarios
#### Architecture
```ruby
# Parallel processing coordinator
class Tracks::ParallelGenerator
def call
time_ranges = split_into_parallel_ranges
time_ranges.map do |range|
Tracks::BatchProcessorJob.perform_later(user_id, range)
end
end
end
```
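`split_into_parallel_ranges` is left undefined above; one plausible sketch, assuming fixed-width slices (the 1-day width is an assumption):

```ruby
# Sketch: split a timeframe into non-overlapping slices for parallel workers.
def split_into_parallel_ranges(from:, to:, slice: 1.day)
  ranges = []
  cursor = from
  while cursor < to
    ranges << (cursor...[cursor + slice, to].min) # end-exclusive, no overlap
    cursor += slice
  end
  ranges
end
```

Each worker then reads only the points inside its own range; segments that span a slice boundary are exactly the coordination problem flagged under Challenges.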
---
### Option 4: Incremental Algorithm Enhancement
**Complexity:** Medium | **Impact:** Medium | **Risk:** Medium
#### Implementation
- Enhance existing `:incremental` mode with smarter buffering
- Implement a sliding-window approach for active track detection (sketched below)
- Add automatic track finalization based on time gaps
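A rough sketch of the sliding-window idea, assuming a per-user in-memory buffer and a hypothetical `finalize_track!` helper:

```ruby
# Sketch: sliding-window buffering for incremental mode. `buffer` holds the
# currently open segment; a time gap above the user's threshold finalizes it.
# `finalize_track!` is hypothetical, not part of this commit.
def ingest(point, buffer, gap_seconds:)
  if buffer.any? && point.timestamp - buffer.last.timestamp > gap_seconds
    finalize_track!(buffer) if buffer.size >= 2
    buffer.clear
  end
  buffer << point
end
```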
#### Benefits
- **Real-time processing:** Process points as they arrive
- **Lower memory footprint:** Only active segments in memory
- **Better for live tracking:** Immediate track updates
#### Current Limitations
- Existing incremental mode processes untracked points only
- No automatic track finalization
- Limited to single active track per user
---
### Option 5: Database-Level Optimization
**Complexity:** Low-Medium | **Impact:** Medium | **Risk:** Low
#### Implementation
- Add composite indexes for common query patterns
- Implement materialized views for expensive calculations
- Use database-level segmentation logic
#### Benefits
- **Faster queries:** Better index utilization
- **Reduced Ruby processing:** Move logic to database
- **Consistent performance:** Database optimizations benefit all modes
#### Proposed Indexes
```sql
-- Optimized for bulk processing
CREATE INDEX CONCURRENTLY idx_points_user_timestamp_track
ON points(user_id, timestamp) WHERE track_id IS NULL;
-- Optimized for incremental processing
CREATE INDEX CONCURRENTLY idx_points_untracked_timestamp
ON points(timestamp) WHERE track_id IS NULL;
```
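In Rails these could also be added via a migration; a sketch (the class name and version tag are assumptions), noting that `CONCURRENTLY` requires `disable_ddl_transaction!`:

```ruby
# Sketch: migration equivalent of the proposed partial indexes.
class AddTrackGenerationIndexes < ActiveRecord::Migration[7.1]
  disable_ddl_transaction! # required for algorithm: :concurrently

  def change
    add_index :points, %i[user_id timestamp],
              where: 'track_id IS NULL',
              algorithm: :concurrently,
              name: 'idx_points_user_timestamp_track'
    add_index :points, :timestamp,
              where: 'track_id IS NULL',
              algorithm: :concurrently,
              name: 'idx_points_untracked_timestamp'
  end
end
```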
---
## Recommended Implementation Strategy
### Phase 1: Quick Wins (Week 1-2)
1. **Implement Enhanced Time-Based Batching** (Option 1)
- Extend existing daily mode with overlap
- Add progress reporting
- Configurable batch sizes
### Phase 2: Database Optimization (Week 3)
2. **Add Database-Level Optimizations** (Option 5)
- Create optimized indexes
- Implement batch distance calculations
### Phase 3: Advanced Features (Week 4-6)
3. **Spatial Indexing Optimization** (Option 2)
- Replace individual distance calculations
- Implement spatial clustering
### Phase 4: Future Enhancements
4. **Parallel Processing** (Option 3) - Consider for v2
5. **Incremental Enhancement** (Option 4) - For real-time features
---
## Risk Assessment
### Low Risk
- **Time-based batching:** Builds on existing daily mode
- **Database indexes:** Standard optimization technique
- **Progress reporting:** UI enhancement only
### Medium Risk
- **Spatial optimization:** Requires careful testing of distance calculations
- **Incremental enhancement:** Changes to existing algorithm logic
### High Risk
- **Parallel processing:** Complex coordination, potential race conditions
- **Major algorithm changes:** Could introduce segmentation bugs
---
## Success Metrics
### Performance Targets
- **Memory usage:** < 100MB for datasets up to 1M points
- **Processing time:** < 10 minutes for 1M points
- **User experience:** Progress indication and cancellation
### Monitoring Points
- Database query performance
- Memory consumption during processing
- User-reported processing times
- Track generation accuracy (no regression)
---
## Next Steps
1. **Choose initial approach** based on urgency and resources
2. **Create feature branch** for selected optimization
3. **Implement comprehensive testing** including edge cases
4. **Monitor performance** in staging environment
5. **Gradual rollout** with feature flags