Fine tune the artificial delays in the render loop

There are now two numbers, repaint_delay and input_delay,
that control how often the screen is repainted and how frequently
input received from the child process is processed.

This halves the CPU usage in intensive cases such as scrolling
a file in less. The CPU usage of kitty + X when scrolling is now
significantly lower than all the other terminals on my system.

MROAWR!

...
This commit is contained in:
Kovid Goyal 2017-09-16 08:10:19 +05:30
parent 43ebddc28f
commit 728f33700a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
8 changed files with 56 additions and 47 deletions

View File

@ -124,7 +124,7 @@ or a similar package manager)
kitty is designed for power keyboard users. To that end all its controls kitty is designed for power keyboard users. To that end all its controls
work with the keyboard (although it fully supports mouse interactions as work with the keyboard (although it fully supports mouse interactions as
well). Its configuration is a simple, human editable, single file for well). Its configuration is a simple, human editable, single file for
easy reproducability (I like to store config files in source control). easy reproducibility (I like to store config files in source control).
The code in kitty is designed to be simple, modular and hackable. It is The code in kitty is designed to be simple, modular and hackable. It is
written in a mix of C (for performance sensitive parts) and Python (for written in a mix of C (for performance sensitive parts) and Python (for
@ -296,10 +296,15 @@ link:kitty/kitty.conf[config file].
== Performance == Performance
The main goals for kitty performance are user perceived latency while typing The main goals for kitty performance are user perceived latency while typing
and "smoothness" while scrolling. kitty tries hard to optimize these. To that and "smoothness" while scrolling as well as CPU usage. kitty tries hard to find
end it keeps a cache of each rendered glyph in video RAM so that font rendering an optimum balance for these. To that end it keeps a cache of each rendered
is not a bottleneck. Interaction with child programs takes place in a separate glyph in video RAM so that font rendering is not a bottleneck. Interaction
thread from rendering, to improve smoothness. with child programs takes place in a separate thread from rendering, to improve
smoothness.
There are two parameters you can tune to adjust the performance: ``repaint_delay``
and ``input_delay``. These control the artificial delays introduced into the
render loop to reduce CPU usage. See the link:kitty/kitty.conf[config file] for details.
You can generate detailed per-function performance data using You can generate detailed per-function performance data using
link:https://github.com/gperftools/gperftools[gperftools]. Build kitty with the link:https://github.com/gperftools/gperftools[gperftools]. Build kitty with the

View File

@ -60,7 +60,7 @@ class Boss:
self.glfw_window_title = None self.glfw_window_title = None
self.shutting_down = False self.shutting_down = False
self.child_monitor = ChildMonitor( self.child_monitor = ChildMonitor(
opts.repaint_delay / 1000.0, glfw_window.window_id(), glfw_window.window_id(),
self.on_child_death, self.on_child_death,
DumpCommands(args) if args.dump_commands or args.dump_bytes else None) DumpCommands(args) if args.dump_commands or args.dump_bytes else None)
set_boss(self) set_boss(self)

View File

@ -30,6 +30,7 @@ extern int pthread_setname_np(const char *name);
#include <GLFW/glfw3.h> #include <GLFW/glfw3.h>
#define EXTRA_FDS 2 #define EXTRA_FDS 2
#define wakeup_main_loop glfwPostEmptyEvent
static void (*parse_func)(Screen*, PyObject*); static void (*parse_func)(Screen*, PyObject*);
@ -123,10 +124,9 @@ new(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) {
ChildMonitor *self; ChildMonitor *self;
PyObject *dump_callback, *death_notify, *wid; PyObject *dump_callback, *death_notify, *wid;
int ret; int ret;
double repaint_delay;
if (the_monitor) { PyErr_SetString(PyExc_RuntimeError, "Can have only a single ChildMonitor instance"); return NULL; } if (the_monitor) { PyErr_SetString(PyExc_RuntimeError, "Can have only a single ChildMonitor instance"); return NULL; }
if (!PyArg_ParseTuple(args, "dOOO", &repaint_delay, &wid, &death_notify, &dump_callback)) return NULL; if (!PyArg_ParseTuple(args, "OOO", &wid, &death_notify, &dump_callback)) return NULL;
glfw_window_id = PyLong_AsVoidPtr(wid); glfw_window_id = PyLong_AsVoidPtr(wid);
if ((ret = pthread_mutex_init(&children_lock, NULL)) != 0) { if ((ret = pthread_mutex_init(&children_lock, NULL)) != 0) {
PyErr_Format(PyExc_RuntimeError, "Failed to create children_lock mutex: %s", strerror(ret)); PyErr_Format(PyExc_RuntimeError, "Failed to create children_lock mutex: %s", strerror(ret));
@ -148,7 +148,6 @@ new(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) {
self->count = 0; self->count = 0;
fds[0].fd = wakeup_fds[0]; fds[1].fd = signal_fds[0]; fds[0].fd = wakeup_fds[0]; fds[1].fd = signal_fds[0];
fds[0].events = POLLIN; fds[1].events = POLLIN; fds[0].events = POLLIN; fds[1].events = POLLIN;
self->repaint_delay = repaint_delay;
the_monitor = self; the_monitor = self;
return (PyObject*) self; return (PyObject*) self;
@ -175,7 +174,7 @@ dealloc(ChildMonitor* self) {
} }
static void static void
wakeup_() { wakeup_io_loop() {
while(true) { while(true) {
ssize_t ret = write(wakeup_fds[1], "w", 1); ssize_t ret = write(wakeup_fds[1], "w", 1);
if (ret < 0) { if (ret < 0) {
@ -208,7 +207,7 @@ join(ChildMonitor *self) {
static PyObject * static PyObject *
wakeup(ChildMonitor UNUSED *self) { wakeup(ChildMonitor UNUSED *self) {
#define wakeup_doc "wakeup() -> wakeup the ChildMonitor I/O thread, forcing it to exit from poll() if it is waiting there." #define wakeup_doc "wakeup() -> wakeup the ChildMonitor I/O thread, forcing it to exit from poll() if it is waiting there."
wakeup_(); wakeup_io_loop();
Py_RETURN_NONE; Py_RETURN_NONE;
} }
@ -258,7 +257,7 @@ schedule_write_to_child(unsigned long id, const char *data, size_t sz) {
screen->write_buf = PyMem_RawRealloc(screen->write_buf, screen->write_buf_sz); screen->write_buf = PyMem_RawRealloc(screen->write_buf, screen->write_buf_sz);
if (screen->write_buf == NULL) { fatal("Out of memory."); } if (screen->write_buf == NULL) { fatal("Out of memory."); }
} }
if (screen->write_buf_used) wakeup_(); if (screen->write_buf_used) wakeup_io_loop();
screen_mutex(unlock, write); screen_mutex(unlock, write);
break; break;
} }
@ -286,31 +285,26 @@ shutdown(ChildMonitor *self) {
Py_RETURN_NONE; Py_RETURN_NONE;
} }
static inline bool static inline void
do_parse(ChildMonitor *self, Screen *screen) { do_parse(ChildMonitor *self, Screen *screen, double now) {
bool updated = false;
screen_mutex(lock, read); screen_mutex(lock, read);
if (screen->read_buf_sz) { if (screen->read_buf_sz) {
parse_func(screen, self->dump_callback); double time_since_new_input = now - screen->new_input_at;
if (screen->read_buf_sz >= READ_BUF_SZ) wakeup_(); // Ensure the read fd has POLLIN set if (time_since_new_input >= OPT(input_delay)) {
screen->read_buf_sz = 0; parse_func(screen, self->dump_callback);
updated = true; if (screen->read_buf_sz >= READ_BUF_SZ) wakeup_io_loop(); // Ensure the read fd has POLLIN set
screen->read_buf_sz = 0;
screen->new_input_at = 0;
} else set_maximum_wait(OPT(input_delay) - time_since_new_input);
} }
screen_mutex(unlock, read); screen_mutex(unlock, read);
if (LIKELY(updated)) {
glfwPostEmptyEvent();
}
return updated;
} }
static double last_parse_at = -1000;
static void static void
parse_input(ChildMonitor *self) { parse_input(ChildMonitor *self) {
// Parse all available input that was read in the I/O thread. // Parse all available input that was read in the I/O thread.
size_t count = 0, remove_count = 0; size_t count = 0, remove_count = 0;
double now = monotonic(); double now = monotonic();
double time_since_last_parse = now - last_parse_at;
bool parse_needed = time_since_last_parse >= self->repaint_delay ? true : false;
children_mutex(lock); children_mutex(lock);
while (remove_queue_count) { while (remove_queue_count) {
remove_queue_count--; remove_queue_count--;
@ -321,15 +315,11 @@ parse_input(ChildMonitor *self) {
if (UNLIKELY(signal_received)) { if (UNLIKELY(signal_received)) {
glfwSetWindowShouldClose(glfw_window_id, true); glfwSetWindowShouldClose(glfw_window_id, true);
glfwPostEmptyEvent();
} else { } else {
if (parse_needed) { count = self->count;
count = self->count; for (size_t i = 0; i < count; i++) {
for (size_t i = 0; i < count; i++) { scratch[i] = children[i];
scratch[i] = children[i]; INCREF_CHILD(scratch[i]);
INCREF_CHILD(scratch[i]);
}
last_parse_at = now;
} }
} }
children_mutex(unlock); children_mutex(unlock);
@ -345,13 +335,10 @@ parse_input(ChildMonitor *self) {
for (size_t i = 0; i < count; i++) { for (size_t i = 0; i < count; i++) {
if (!scratch[i].needs_removal) { if (!scratch[i].needs_removal) {
do_parse(self, scratch[i].screen); do_parse(self, scratch[i].screen, now);
} }
DECREF_CHILD(scratch[i]); DECREF_CHILD(scratch[i]);
} }
if (!parse_needed) {
set_maximum_wait(self->repaint_delay - time_since_last_parse);
}
} }
static PyObject * static PyObject *
@ -494,9 +481,9 @@ render_cursor(Window *w, double now) {
} }
static inline bool static inline bool
render(ChildMonitor *self, double now) { render(double now) {
double time_since_last_render = now - last_render_at; double time_since_last_render = now - last_render_at;
if (time_since_last_render > self->repaint_delay) { if (time_since_last_render > OPT(repaint_delay)) {
draw_borders(); draw_borders();
#define TD global_state.tab_bar_render_data #define TD global_state.tab_bar_render_data
if (TD.screen && global_state.num_tabs > 1) draw_cells(TD.vao_idx, TD.xstart, TD.ystart, TD.dx, TD.dy, TD.screen); if (TD.screen && global_state.num_tabs > 1) draw_cells(TD.vao_idx, TD.xstart, TD.ystart, TD.dx, TD.dy, TD.screen);
@ -536,7 +523,7 @@ render(ChildMonitor *self, double now) {
glfwSwapBuffers(glfw_window_id); glfwSwapBuffers(glfw_window_id);
last_render_at = now; last_render_at = now;
} else { } else {
set_maximum_wait(self->repaint_delay - time_since_last_render); set_maximum_wait(OPT(repaint_delay) - time_since_last_render);
} }
return true; return true;
} }
@ -595,7 +582,7 @@ main_loop(ChildMonitor *self) {
while (!glfwWindowShouldClose(glfw_window_id)) { while (!glfwWindowShouldClose(glfw_window_id)) {
double now = monotonic(); double now = monotonic();
maximum_wait = -1; maximum_wait = -1;
if (!render(self, now)) break; if (!render(now)) break;
if (global_state.mouse_visible && OPT(mouse_hide_wait) > 0 && now - global_state.last_mouse_activity_at > OPT(mouse_hide_wait)) { if (global_state.mouse_visible && OPT(mouse_hide_wait) > 0 && now - global_state.last_mouse_activity_at > OPT(mouse_hide_wait)) {
glfwSetInputMode(glfw_window_id, GLFW_CURSOR, GLFW_CURSOR_HIDDEN); glfwSetInputMode(glfw_window_id, GLFW_CURSOR, GLFW_CURSOR_HIDDEN);
global_state.mouse_visible = false; global_state.mouse_visible = false;
@ -714,6 +701,7 @@ read_bytes(int fd, Screen *screen) {
break; break;
} }
if (UNLIKELY(len == 0)) return false; if (UNLIKELY(len == 0)) return false;
if (screen->new_input_at == 0) screen->new_input_at = monotonic();
screen_mutex(lock, read); screen_mutex(lock, read);
if (orig_sz != screen->read_buf_sz) { if (orig_sz != screen->read_buf_sz) {
// The other thread consumed some of the screen read buffer // The other thread consumed some of the screen read buffer
@ -828,7 +816,7 @@ io_loop(void *data) {
perror("Call to poll() failed"); perror("Call to poll() failed");
} }
} }
if (data_received) glfwPostEmptyEvent(); if (data_received) wakeup_main_loop();
} }
children_mutex(lock); children_mutex(lock);
for (i = 0; i < self->count; i++) children[i].needs_removal = true; for (i = 0; i < self->count; i++) children[i].needs_removal = true;

View File

@ -215,6 +215,7 @@ type_map = {
'cursor_opacity': to_opacity, 'cursor_opacity': to_opacity,
'open_url_modifiers': to_open_url_modifiers, 'open_url_modifiers': to_open_url_modifiers,
'repaint_delay': positive_int, 'repaint_delay': positive_int,
'input_delay': positive_int,
'window_border_width': positive_float, 'window_border_width': positive_float,
'window_margin_width': positive_float, 'window_margin_width': positive_float,
'window_padding_width': positive_float, 'window_padding_width': positive_float,

View File

@ -251,6 +251,7 @@ typedef struct {
unsigned int parser_state, parser_text_start, parser_buf_pos; unsigned int parser_state, parser_text_start, parser_buf_pos;
bool parser_has_pending_text; bool parser_has_pending_text;
uint8_t read_buf[READ_BUF_SZ], *write_buf; uint8_t read_buf[READ_BUF_SZ], *write_buf;
double new_input_at;
size_t read_buf_sz, write_buf_sz, write_buf_used; size_t read_buf_sz, write_buf_sz, write_buf_used;
pthread_mutex_t read_buf_lock, write_buf_lock; pthread_mutex_t read_buf_lock, write_buf_lock;
@ -267,7 +268,6 @@ typedef struct {
PyObject_HEAD PyObject_HEAD
PyObject *dump_callback, *update_screen, *death_notify; PyObject *dump_callback, *update_screen, *death_notify;
double repaint_delay;
unsigned int count; unsigned int count;
bool shutting_down; bool shutting_down;
pthread_t io_thread; pthread_t io_thread;

View File

@ -105,11 +105,18 @@ remember_window_size yes
initial_window_width 640 initial_window_width 640
initial_window_height 400 initial_window_height 400
# Delay (in milliseconds) between screen updates. Decreasing it, increases fps # Delay (in milliseconds) between screen updates. Decreasing it, increases
# at the cost of more CPU usage. The default value yields ~100fps which is more # frames-per-second (FPS) at the cost of more CPU usage. The default value
# than sufficient for most uses. # yields ~100 FPS which is more than sufficient for most uses.
repaint_delay 10 repaint_delay 10
# Delay (in milliseconds) before input from the program running in the terminal
# is processed. Note that decreasing it will increase responsiveness, but also
# increase CPU usage and might cause flicker in full screen programs that
# redraw the entire screen on each loop, because kitty is so fast that partial
# screen updates will be drawn.
input_delay 3
# Visual bell duration. Flash the screen when a bell occurs for the specified number of # Visual bell duration. Flash the screen when a bell occurs for the specified number of
# seconds. Set to zero to disable. # seconds. Set to zero to disable.
visual_bell_duration 0.0 visual_bell_duration 0.0

View File

@ -126,6 +126,11 @@ color_as_int(PyObject *color) {
#undef I #undef I
} }
/*
 * Convert a Python integer option value into a double, scaled down by 1000.
 * The config file documents these options in milliseconds, so the result
 * is in seconds. Used as the converter for both the repaint_delay and
 * input_delay options.
 *
 * NOTE(review): PyLong_AsUnsignedLong() signals failure by returning
 * (unsigned long)-1 with a Python exception set; this helper does not check
 * for that itself -- presumably the caller inspects the pending exception,
 * confirm against set_options.
 */
static inline double
repaint_delay(PyObject *val) {
    const unsigned long millis = PyLong_AsUnsignedLong(val);
    return (double)millis / 1000.0;
}
#define dict_iter(d) { \ #define dict_iter(d) { \
PyObject *key, *value; Py_ssize_t pos = 0; \ PyObject *key, *value; Py_ssize_t pos = 0; \
while (PyDict_Next(d, &pos, &key, &value)) while (PyDict_Next(d, &pos, &key, &value))
@ -155,6 +160,8 @@ PYWRAP1(set_options) {
S(open_url_modifiers, PyLong_AsUnsignedLong); S(open_url_modifiers, PyLong_AsUnsignedLong);
S(click_interval, PyFloat_AsDouble); S(click_interval, PyFloat_AsDouble);
S(url_color, color_as_int); S(url_color, color_as_int);
S(repaint_delay, repaint_delay);
S(input_delay, repaint_delay);
PyObject *chars = PyObject_GetAttrString(args, "select_by_word_characters"); PyObject *chars = PyObject_GetAttrString(args, "select_by_word_characters");
if (chars == NULL) return NULL; if (chars == NULL) return NULL;

View File

@ -16,6 +16,7 @@ typedef struct {
unsigned int open_url_modifiers; unsigned int open_url_modifiers;
char_type select_by_word_characters[256]; size_t select_by_word_characters_count; char_type select_by_word_characters[256]; size_t select_by_word_characters_count;
color_type url_color; color_type url_color;
double repaint_delay, input_delay;
} Options; } Options;
typedef struct { typedef struct {