#!/usr/bin/env python3

# this program analyses the trace files generated when running with the
# --vips-profile flag
#
# for example:
#
#     $ vips sharpen ~/pics/k2.jpg x.jpg --vips-profile
#     recording profile in vips-profile.txt
#     $ vipsprofile
#     reading from vips-profile.txt
#     loaded 30001 events
#     total time = 0.082064
#     name             alive wait%   work%   unkn%     mem   peakm
#         worker 40	  0.01	31.8	56.9	11.3	 3.4	 3.4
#         worker 42	  0.02	37.8	42.1	20.1	 3.4	 3.4
#     ...
#     writing to vips-profile.svg

import re
import cairo
from io import open

class ReadFile:
    def __init__(self, filename):
        self.filename = filename

    def __enter__(self):
        self.f = open(self.filename, 'r', encoding='utf-8')
        self.lineno = 0
        self.getnext();
        return self

    def __exit__(self, type, value, traceback):
        self.f.close()

    def __bool__(self):
        return self.line != ""

    __nonzero__ = __bool__

    def getnext(self):
        self.lineno += 1
        self.line = self.f.readline()

def read_times(rf):
    times = []

    while True:
        match = re.match('[+-]?[0-9]+ ', rf.line)
        if not match:
            break
        times += [int(x) for x in re.split(' ', rf.line.rstrip())]
        rf.getnext()

    return times[::-1]

class Thread:
    thread_number = 0

    def __init__(self, thread_name):
        # no one cares about the thread address
        match = re.match(r'(.*) \(0x.*?\) (.*)', thread_name)
        if match:
            thread_name = match.group(1) + " " + match.group(2)

        self.thread_name = thread_name
        self.thread_number = Thread.thread_number
        self.all_events = []
        self.workwait_events = []
        self.memory_events = []
        self.other_events = []
        Thread.thread_number += 1

all_events = []

class Event:
    def __init__(self, thread, gate_location, gate_name, start, stop):
        self.thread = thread
        self.gate_location = gate_location
        self.gate_name = gate_name

        self.work = False
        self.wait = False
        self.memory = False
        if gate_location == "memory":
            self.memory = True
        elif re.match('.*work.*', gate_name):
            self.work = True
        elif re.match('.*wait.*', gate_name):
            self.wait = True

        if self.memory:
            self.start = start
            self.stop = start
            self.size = stop
        else:
            self.start = start
            self.stop = stop

        thread.all_events.append(self)
        all_events.append(self)
        if self.wait or self.work:
            thread.workwait_events.append(self)
        elif self.memory:
            thread.memory_events.append(self)
        else:
            thread.other_events.append(self)

input_filename = 'vips-profile.txt'

thread_id = 0
threads = []
n_events = 0
print('reading from', input_filename)
with ReadFile(input_filename) as rf:
    while rf:
        if rf.line.rstrip() == "":
            rf.getnext()
            continue
        if rf.line[0] == "#":
            rf.getnext()
            continue

        match = re.match('thread: (.*)', rf.line)
        if not match:
            print('parse error line %d, expected "thread"' % rf.lineno)
        thread_name = match.group(1) + " " + str(thread_id)
        thread_id += 1
        thread = Thread(thread_name)
        threads.append(thread)
        rf.getnext()

        while True:
            match = re.match('^gate: (.*?)(: (.*))?$', rf.line)
            if not match:
                break
            gate_location = match.group(1)
            gate_name = match.group(3)
            rf.getnext()

            match = re.match('start:', rf.line)
            if not match:
                continue
            rf.getnext()

            start = read_times(rf)

            match = re.match('stop:', rf.line)
            if not match:
                continue
            rf.getnext()

            stop = read_times(rf)

            if len(start) != len(stop):
                print('start and stop length mismatch')

            for a, b in zip(start, stop):
                Event(thread, gate_location, gate_name, a, b)
                n_events += 1

for thread in threads:
    thread.all_events.sort(key=lambda x: x.start)
    thread.workwait_events.sort(key=lambda x: x.start)
    thread.memory_events.sort(key=lambda x: x.start)
    thread.other_events.sort(key=lambda x: x.start)

all_events.sort(key=lambda x: x.start)

print(f'loaded {n_events} events')

# move time axis to secs of computation
ticks_per_sec = 1000000.0
first_time = all_events[0].start
last_time = 0
for event in all_events:
    if event.start < first_time:
        first_time = event.start
    if event.stop > last_time:
        last_time = event.stop

for event in all_events:
    event.start = (event.start - first_time) / ticks_per_sec
    event.stop = (event.stop - first_time) / ticks_per_sec

last_time = (last_time - first_time) / ticks_per_sec
first_time = 0

print(f'total time = {last_time}')

# calculate some simple stats
for thread in threads:
    thread.start = last_time
    thread.stop = 0
    thread.wait = 0
    thread.work = 0
    thread.mem = 0
    thread.peak_mem = 0
    for event in thread.all_events:
        if event.start < thread.start:
            thread.start = event.start
        if event.stop > thread.stop:
            thread.stop = event.stop
        if event.wait:
            thread.wait += event.stop - event.start
        if event.work:
            thread.work += event.stop - event.start
        if event.memory:
            thread.mem += event.size
            if thread.mem > thread.peak_mem:
                thread.peak_mem = thread.mem

    thread.alive = thread.stop - thread.start

    # hide very short-lived threads
    thread.hide = thread.alive < 0.01

print('name             alive wait%   work%   unkn%     mem   peakm')
for thread in threads:
    if thread.hide:
        continue

    wait_percent = 100 * thread.wait / thread.alive
    work_percent = 100 * thread.work / thread.alive
    unkn_percent = 100 - 100 * (thread.work + thread.wait) / thread.alive

    print((f'{thread.thread_name:>13}\t{thread.alive:6.2f}\t'
           f'{wait_percent:>4.1f}\t{work_percent:>4.1f}\t{unkn_percent:>4.1f}\t'
           f'{thread.mem / (1024 * 1024):>4.1f}\t'
           f'{thread.peak_mem / (1024 * 1024):>4.1f}'))

mem = 0
peak_mem = 0
for event in all_events:
    if event.memory:
        mem += event.size
        if mem > peak_mem:
            peak_mem = mem

print(f'peak memory = {peak_mem / (1024 * 1024):.1f} MB')
if mem != 0:
    print(f'leak! final memory = {mem / (1024 * 1024):.1f} MB')

# does a list of events contain an overlap?
# assume the list of events has been sorted by start time
def events_overlap(events):
    for i in range(0, len(events) - 1):
        # we can't just test for stop1 > start2 since one (or both) events
        # might have duration zero
        event1 = events[i]
        event2 = events[i + 1]
        overlap_start = max(event1.start, event2.start)
        overlap_stop = min(event1.stop, event2.stop)
        if overlap_stop - overlap_start > 0:
            return True

    return False

# do the events on two gates overlap?
def gates_overlap(events, gate_name1, gate_name2):
    merged = []

    for event in events:
        if event.gate_name == gate_name1 or event.gate_name == gate_name2:
            merged.append(event)

    merged.sort(key=lambda x: x.start)

    return events_overlap(merged)

# show top 10 waits
wait = {}
for thread in threads:
    for event in thread.all_events:
        if event.wait:
            name = f'{event.gate_location}::{event.gate_name}'
            if name not in wait:
                wait[name] = 0

            wait[name] += event.stop - event.start

print('name  wait')
for [name, time] in sorted(wait.items(), reverse=True, key=lambda x: x[1])[:10]:
    print(f'{name:>35}\t{time:.2f}')

# allocate a y position for each gate
total_y = 0
for thread in threads:
    if thread.hide:
        continue

    thread.total_y = total_y

    gate_positions = {}

    # first pass .. move work and wait events to y == 0
    if events_overlap(thread.workwait_events):
        print('gate overlap on thread', thread.thread_name)
        for i in range(0, len(thread.workwait_events) - 1):
            event1 = thread.workwait_events[i]
            event2 = thread.workwait_events[i + 1]
            overlap_start = max(event1.start, event2.start)
            overlap_stop = min(event1.stop, event2.stop)
            if overlap_stop - overlap_start > 0:
                print('overlap:')
                print('event', event1.gate_location, event1.gate_name, end=' ')
                print('starts at', event1.start, 'stops at', event1.stop)
                print('event', event2.gate_location, event2.gate_name, end=' ')
                print('starts at', event2.start, 'stops at', event2.stop)

    for event in thread.workwait_events:
        gate_positions[event.gate_name] = 0
        event.y = 0
        event.total_y = total_y

    for event in thread.memory_events:
        gate_positions[event.gate_name] = 0
        event.y = 0
        event.total_y = total_y

    # second pass: move all other events to non-overlapping ys
    y = 1
    for event in thread.other_events:
        if not event.gate_name in gate_positions:
            # look at all the ys we've allocated previously and see if we can
            # add this gate to one of them
            for gate_y in range(1, y):
                found_overlap = False
                for gate_name in gate_positions:
                    if gate_positions[gate_name] != gate_y:
                        continue

                    if gates_overlap(thread.other_events,
                            event.gate_name, gate_name):
                        found_overlap = True
                        break

                if not found_overlap:
                    gate_positions[event.gate_name] = gate_y
                    break

            # failure? add a new y
            if not event.gate_name in gate_positions:
                gate_positions[event.gate_name] = y
                y += 1

        event.y = gate_positions[event.gate_name]

    # third pass: flip the order of the ys to get the lowest-level ones at the
    # top, next to the wait/work line
    for event in thread.other_events:
        event.y = y - event.y
        event.total_y = total_y + event.y

    total_y += y

PIXELS_PER_SECOND = 1000
PIXELS_PER_GATE = 20
LEFT_BORDER = 130
BAR_HEIGHT = 5
MEM_HEIGHT = 100
WIDTH = int(LEFT_BORDER + last_time * PIXELS_PER_SECOND) + 20
HEIGHT = int(total_y * PIXELS_PER_GATE) + MEM_HEIGHT + 30

output_filename = "vips-profile.svg"
print('writing to', output_filename)

surface = cairo.SVGSurface(output_filename, WIDTH, HEIGHT)

ctx = cairo.Context(surface)
ctx.select_font_face('Sans')
ctx.set_font_size(15)

ctx.rectangle(0, 0, WIDTH, HEIGHT)
ctx.set_source_rgba(0.0, 0.0, 0.3, 1.0)
ctx.fill()

def draw_event(ctx, event):
    left = event.start * PIXELS_PER_SECOND + LEFT_BORDER
    top = event.total_y * PIXELS_PER_GATE + BAR_HEIGHT // 2
    width = (event.stop - event.start) * PIXELS_PER_SECOND
    height = BAR_HEIGHT

    if event.memory:
        width = 1
        height /= 2
        top += BAR_HEIGHT

    ctx.rectangle(left, top, width, height)

    if event.wait:
        ctx.set_source_rgb(0.9, 0.1, 0.1)
    elif event.work:
        ctx.set_source_rgb(0.1, 0.9, 0.1)
    elif event.memory:
        ctx.set_source_rgb(1.0, 1.0, 1.0)
    else:
        ctx.set_source_rgb(0.1, 0.1, 0.9)

    ctx.fill()
    if not event.wait and not event.work and not event.memory:
        xbearing, ybearing, twidth, theight, xadvance, yadvance = \
                ctx.text_extents(event.gate_name)
        ctx.move_to(left + width // 2 - twidth // 2, top + 3 * BAR_HEIGHT)
        ctx.set_source_rgb(1.00, 0.83, 0.00)
        ctx.show_text(event.gate_name)

for thread in threads:
    if thread.hide:
        continue

    ctx.rectangle(0, thread.total_y * PIXELS_PER_GATE, WIDTH, 1)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.fill()

    xbearing, ybearing, twidth, theight, xadvance, yadvance = \
            ctx.text_extents(thread.thread_name)
    ctx.move_to(0, theight + thread.total_y * PIXELS_PER_GATE + BAR_HEIGHT // 2)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.show_text(thread.thread_name)

    for event in thread.all_events:
        draw_event(ctx, event)

memory_y = total_y * PIXELS_PER_GATE

label = "memory"
xbearing, ybearing, twidth, theight, xadvance, yadvance = \
        ctx.text_extents(label)
ctx.move_to(0, memory_y + theight + 8)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.show_text(label)

mem = 0
ctx.move_to(LEFT_BORDER, memory_y + MEM_HEIGHT)

for event in all_events:
    if event.memory:
        mem += event.size

        left = LEFT_BORDER + event.start * PIXELS_PER_SECOND
        top = memory_y + MEM_HEIGHT - (MEM_HEIGHT * mem / peak_mem)

        ctx.line_to(left, top)

ctx.set_line_width(1)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.stroke()

axis_y = total_y * PIXELS_PER_GATE + MEM_HEIGHT

ctx.rectangle(LEFT_BORDER, axis_y, last_time * PIXELS_PER_SECOND, 1)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.fill()

label = "time"
xbearing, ybearing, twidth, theight, xadvance, yadvance = \
        ctx.text_extents(label)
ctx.move_to(0, axis_y + theight + 8)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.show_text(label)

for t in range(0, int(last_time * PIXELS_PER_SECOND), PIXELS_PER_SECOND // 10):
    left = t + LEFT_BORDER
    top = axis_y

    ctx.rectangle(left, top, 1, 5)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.fill()

    label = str(t / PIXELS_PER_SECOND)
    xbearing, ybearing, twidth, theight, xadvance, yadvance = \
            ctx.text_extents(label)
    ctx.move_to(left - twidth // 2, top + theight + 8)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.show_text(label)

surface.finish()
