Fine tune the artificial delays in the render loop

There are now two numbers, repaint_delay and input_delay, that control how often the screen is repainted and how frequently input received from the child process is processed. This halves the CPU usage in intensive cases such as scrolling a file in less. The CPU usage of kitty + X when scrolling is now significantly lower than all the other terminals on my system. MROAWR! ...
2017-09-16 08:10:19 +05:30 · 2017-09-16 08:10:19 +05:30 · 728f33700a
commit 728f33700a
parent 43ebddc28f
8 changed files with 56 additions and 47 deletions
--- a/README.asciidoc
+++ b/README.asciidoc
@ -124,7 +124,7 @@ or a similar package manager)
 kitty is designed for power keyboard users. To that end all its controls
 work with the keyboard (although it fully supports mouse interactions as
 well). Its configuration is a simple, human editable, single file for
-easy reproducability (I like to store config files in source control).
+easy reproducibility (I like to store config files in source control).
 The code in kitty is designed to be simple, modular and hackable. It is
 written in a mix of C (for performance sensitive parts) and Python (for
@ -296,10 +296,15 @@ link:kitty/kitty.conf[config file].
 == Performance
 The main goals for kitty performance are user perceived latency while typing
-and "smoothness" while scrolling. kitty tries hard to optimize these. To that
+and "smoothness" while scrolling as well as CPU usage. kitty tries hard to find
-end it keeps a cache of each rendered glyph in video RAM so that font rendering
+an optimum balance for these. To that end it keeps a cache of each rendered
-is not a bottleneck.  Interaction with child programs takes place in a separate
+glyph in video RAM so that font rendering is not a bottleneck.  Interaction
-thread from rendering, to improve smoothness.
+with child programs takes place in a separate thread from rendering, to improve
 smoothness.
 There are two parameters you can tune to adjust the performance: ``repaint_delay``
 and ``input_delay``. These control the artificial delays introduced into the
 render loop to reduce CPU usage. See the link:kitty/kitty.conf[config file] for details.
 You can generate detailed per-function performance data using
 link:https://github.com/gperftools/gperftools[gperftools]. Build kitty with the
--- a/kitty/boss.py
+++ b/kitty/boss.py
@ -60,7 +60,7 @@ class Boss:
        self.glfw_window_title = None
        self.shutting_down = False
        self.child_monitor = ChildMonitor(
-            opts.repaint_delay / 1000.0, glfw_window.window_id(),
+            glfw_window.window_id(),
            self.on_child_death,
            DumpCommands(args) if args.dump_commands or args.dump_bytes else None)
        set_boss(self)
--- a/kitty/child-monitor.c
+++ b/kitty/child-monitor.c
@ -30,6 +30,7 @@ extern int pthread_setname_np(const char *name);
 #include <GLFW/glfw3.h>
 #define EXTRA_FDS 2
 #define wakeup_main_loop glfwPostEmptyEvent
 static void (*parse_func)(Screen*, PyObject*);
@ -123,10 +124,9 @@ new(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) {
    ChildMonitor *self;
    PyObject *dump_callback, *death_notify, *wid; 
    int ret;
    double repaint_delay;
    if (the_monitor) { PyErr_SetString(PyExc_RuntimeError, "Can have only a single ChildMonitor instance"); return NULL; }
-    if (!PyArg_ParseTuple(args, "dOOO", &repaint_delay, &wid, &death_notify, &dump_callback)) return NULL; 
+    if (!PyArg_ParseTuple(args, "OOO", &wid, &death_notify, &dump_callback)) return NULL; 
    glfw_window_id = PyLong_AsVoidPtr(wid);
    if ((ret = pthread_mutex_init(&children_lock, NULL)) != 0) {
        PyErr_Format(PyExc_RuntimeError, "Failed to create children_lock mutex: %s", strerror(ret));
@ -148,7 +148,6 @@ new(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) {
    self->count = 0; 
    fds[0].fd = wakeup_fds[0]; fds[1].fd = signal_fds[0];
    fds[0].events = POLLIN; fds[1].events = POLLIN;
    self->repaint_delay = repaint_delay;
    the_monitor = self;
    return (PyObject*) self;
@ -175,7 +174,7 @@ dealloc(ChildMonitor* self) {
 }
 static void
-wakeup_() {
+wakeup_io_loop() {
    while(true) {
        ssize_t ret = write(wakeup_fds[1], "w", 1);
        if (ret < 0) {
@ -208,7 +207,7 @@ join(ChildMonitor *self) {
 static PyObject *
 wakeup(ChildMonitor UNUSED *self) {
 #define wakeup_doc "wakeup() -> wakeup the ChildMonitor I/O thread, forcing it to exit from poll() if it is waiting there."
-    wakeup_();
+    wakeup_io_loop();
    Py_RETURN_NONE;
 }
@ -258,7 +257,7 @@ schedule_write_to_child(unsigned long id, const char *data, size_t sz) {
                screen->write_buf = PyMem_RawRealloc(screen->write_buf, screen->write_buf_sz);
                if (screen->write_buf == NULL) { fatal("Out of memory."); }
            }
-            if (screen->write_buf_used) wakeup_();
+            if (screen->write_buf_used) wakeup_io_loop();
            screen_mutex(unlock, write);
            break;
        }
@ -286,31 +285,26 @@ shutdown(ChildMonitor *self) {
    Py_RETURN_NONE;
 }
-static inline bool
+static inline void
-do_parse(ChildMonitor *self, Screen *screen) {
+do_parse(ChildMonitor *self, Screen *screen, double now) {
    bool updated = false;
    screen_mutex(lock, read);
    if (screen->read_buf_sz) {
-        parse_func(screen, self->dump_callback);
+        double time_since_new_input = now - screen->new_input_at;
-        if (screen->read_buf_sz >= READ_BUF_SZ) wakeup_();  // Ensure the read fd has POLLIN set
+        if (time_since_new_input >= OPT(input_delay)) {
-        screen->read_buf_sz = 0;
+            parse_func(screen, self->dump_callback);
-        updated = true;
+            if (screen->read_buf_sz >= READ_BUF_SZ) wakeup_io_loop();  // Ensure the read fd has POLLIN set
            screen->read_buf_sz = 0;
            screen->new_input_at = 0;
        } else set_maximum_wait(OPT(input_delay) - time_since_new_input);
    }
    screen_mutex(unlock, read);
    if (LIKELY(updated)) {
        glfwPostEmptyEvent();
    }
    return updated;
 }
 static double last_parse_at = -1000;
 static void
 parse_input(ChildMonitor *self) {
    // Parse all available input that was read in the I/O thread.
    size_t count = 0, remove_count = 0;
    double now = monotonic();
    double time_since_last_parse = now - last_parse_at; 
    bool parse_needed = time_since_last_parse >= self->repaint_delay ? true : false;
    children_mutex(lock);
    while (remove_queue_count) {
        remove_queue_count--; 
@ -321,15 +315,11 @@ parse_input(ChildMonitor *self) {
    if (UNLIKELY(signal_received)) {
        glfwSetWindowShouldClose(glfw_window_id, true);
        glfwPostEmptyEvent();
    } else {
-        if (parse_needed) {
+        count = self->count;
-            count = self->count;
+        for (size_t i = 0; i < count; i++) {
-            for (size_t i = 0; i < count; i++) {
+            scratch[i] = children[i];
-                scratch[i] = children[i];
+            INCREF_CHILD(scratch[i]);
                INCREF_CHILD(scratch[i]);
            }
            last_parse_at = now;
        }
    }
    children_mutex(unlock);
@ -345,13 +335,10 @@ parse_input(ChildMonitor *self) {
    for (size_t i = 0; i < count; i++) {
        if (!scratch[i].needs_removal) {
-            do_parse(self, scratch[i].screen);
+            do_parse(self, scratch[i].screen, now);
        }
        DECREF_CHILD(scratch[i]);
    }
    if (!parse_needed) {
        set_maximum_wait(self->repaint_delay - time_since_last_parse);
    } 
 }
 static PyObject *
@ -494,9 +481,9 @@ render_cursor(Window *w, double now) {
 }
 static inline bool
-render(ChildMonitor *self, double now) {
+render(double now) {
    double time_since_last_render = now - last_render_at;
-    if (time_since_last_render > self->repaint_delay) {
+    if (time_since_last_render > OPT(repaint_delay)) {
        draw_borders();
 #define TD global_state.tab_bar_render_data
        if (TD.screen && global_state.num_tabs > 1) draw_cells(TD.vao_idx, TD.xstart, TD.ystart, TD.dx, TD.dy, TD.screen);
@ -536,7 +523,7 @@ render(ChildMonitor *self, double now) {
        glfwSwapBuffers(glfw_window_id);
        last_render_at = now;
    } else {
-        set_maximum_wait(self->repaint_delay - time_since_last_render);
+        set_maximum_wait(OPT(repaint_delay) - time_since_last_render);
    }
    return true;
 }
@ -595,7 +582,7 @@ main_loop(ChildMonitor *self) {
    while (!glfwWindowShouldClose(glfw_window_id)) {
        double now = monotonic();
        maximum_wait = -1;
-        if (!render(self, now)) break;
+        if (!render(now)) break;
        if (global_state.mouse_visible && OPT(mouse_hide_wait) > 0 && now - global_state.last_mouse_activity_at > OPT(mouse_hide_wait)) {
            glfwSetInputMode(glfw_window_id, GLFW_CURSOR, GLFW_CURSOR_HIDDEN);
            global_state.mouse_visible = false;
@ -714,6 +701,7 @@ read_bytes(int fd, Screen *screen) {
        break;
    }
    if (UNLIKELY(len == 0)) return false;
    if (screen->new_input_at == 0) screen->new_input_at = monotonic();
    screen_mutex(lock, read);
    if (orig_sz != screen->read_buf_sz) {
        // The other thread consumed some of the screen read buffer
@ -828,7 +816,7 @@ io_loop(void *data) {
                perror("Call to poll() failed");
            }
        }
-        if (data_received) glfwPostEmptyEvent();
+        if (data_received) wakeup_main_loop();
    }
    children_mutex(lock);
    for (i = 0; i < self->count; i++) children[i].needs_removal = true;
--- a/kitty/config.py
+++ b/kitty/config.py
@ -215,6 +215,7 @@ type_map = {
    'cursor_opacity': to_opacity,
    'open_url_modifiers': to_open_url_modifiers,
    'repaint_delay': positive_int,
    'input_delay': positive_int,
    'window_border_width': positive_float,
    'window_margin_width': positive_float,
    'window_padding_width': positive_float,
--- a/kitty/data-types.h
+++ b/kitty/data-types.h
@ -251,6 +251,7 @@ typedef struct {
    unsigned int parser_state, parser_text_start, parser_buf_pos;
    bool parser_has_pending_text;
    uint8_t read_buf[READ_BUF_SZ], *write_buf;
    double new_input_at;
    size_t read_buf_sz, write_buf_sz, write_buf_used;
    pthread_mutex_t read_buf_lock, write_buf_lock;
@ -267,7 +268,6 @@ typedef struct {
    PyObject_HEAD
    PyObject *dump_callback, *update_screen, *death_notify;
    double repaint_delay;
    unsigned int count;
    bool shutting_down;
    pthread_t io_thread;
--- a/kitty/kitty.conf
+++ b/kitty/kitty.conf
@ -105,11 +105,18 @@ remember_window_size   yes
 initial_window_width   640
 initial_window_height  400
-# Delay (in milliseconds) between screen updates. Decreasing it, increases fps
+# Delay (in milliseconds) between screen updates. Decreasing it increases
-# at the cost of more CPU usage. The default value yields ~100fps which is more
+# frames-per-second (FPS) at the cost of more CPU usage. The default value
-# than sufficient for most uses.
+# yields ~100 FPS which is more than sufficient for most uses.
 repaint_delay    10
 # Delay (in milliseconds) before input from the program running in the terminal
 # is processed. Note that decreasing it will increase responsiveness, but also
 # increase CPU usage and might cause flicker in full screen programs that
 # redraw the entire screen on each loop, because kitty is so fast that partial
 # screen updates will be drawn.
 input_delay 3
 # Visual bell duration. Flash the screen when a bell occurs for the specified number of
 # seconds. Set to zero to disable.
 visual_bell_duration 0.0
--- a/kitty/state.c
+++ b/kitty/state.c
@ -126,6 +126,11 @@ color_as_int(PyObject *color) {
 #undef I
 }
 /* Convert a Python integer delay, given in milliseconds, to seconds.
  * The value returned by PyLong_AsUnsignedLong() is not checked for
  * failure here. */
 static inline double
 repaint_delay(PyObject *val) {
    const unsigned long delay_ms = PyLong_AsUnsignedLong(val);
    return (double)delay_ms / 1000.0;
 }
 #define dict_iter(d) { \
    PyObject *key, *value; Py_ssize_t pos = 0; \
    while (PyDict_Next(d, &pos, &key, &value))
@ -155,6 +160,8 @@ PYWRAP1(set_options) {
    S(open_url_modifiers, PyLong_AsUnsignedLong);
    S(click_interval, PyFloat_AsDouble);
    S(url_color, color_as_int);
    S(repaint_delay, repaint_delay);
    S(input_delay, repaint_delay);
    PyObject *chars = PyObject_GetAttrString(args, "select_by_word_characters");
    if (chars == NULL) return NULL;
--- a/kitty/state.h
+++ b/kitty/state.h
@ -16,6 +16,7 @@ typedef struct {
    unsigned int open_url_modifiers;
    char_type select_by_word_characters[256]; size_t select_by_word_characters_count;
    color_type url_color;
    double repaint_delay, input_delay;
 } Options;
 typedef struct {