CEPH PG deep-scrub cron

Rationale

CEPH deep-scrubbing is sometimes brutal: PGs eventually go beyond the max deep scrub interval, and they all happily go into deep-scrub at the exact same time. This most probably happens at the worst time (hello there, Murphy’s law), bringing your available IO to its knees when you need it the most (which is practically “anytime”, right?).

To avoid this, you can plan your deep-scrubbing at times that fit your business rules. The idea here is to set your OSDs’ osd_deep_scrub_interval high enough that it never really kicks in, and to have your PGs deep-scrubbed yourself, when the time seems right, on a shorter schedule; for instance:

  •  osd deep scrub interval = 2419200 #That’s 4 weeks
  • Deep-scrub everything over a 1-week period.

To further spread the load, deep-scrub only a given share of your PGs at a time, with no more than a given number of deep-scrub tasks running in parallel; e.g. deep-scrub the “oldest” 1/7th of your PGs once a day, with never more than 2 parallel scrubs at a time.

The script below does just that.

It requires python to parse ceph’s JSON output. Adjust the “MAXSCRUBS” variable to set the parallelism, call the script with an argument defining the share of all active PGs to be scrubbed in one run (or just use the default value of 7), call it from a cron job and you’re good to go.
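
As an illustration, a cron entry (hypothetical path and schedule, /etc/cron.d style) could look like this:

#Scrub the oldest 1/7th of the PGs every night at 03:00
0 3 * * *   root   /usr/local/sbin/ceph-deep-scrub-pgs.sh 7 >> /var/log/ceph-deep-scrub.log 2>&1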

Don’t forget to increase your osd_deep_scrub_interval so Ceph by itself does not mess with your careful planning.
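
For instance (a sketch, assuming the admin keyring is available on the node), you could persist the value in ceph.conf and push it to the running OSDs:

#ceph.conf, [osd] section: osd deep scrub interval = 2419200
#Push the new value to the running OSDs without restarting them:
ceph tell 'osd.*' injectargs '--osd_deep_scrub_interval 2419200'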

CEPH: planned deep-scrubbing script

#!/bin/bash
CEPH=/usr/bin/ceph
AWK=/usr/bin/awk
SORT=/usr/bin/sort
HEAD=/usr/bin/head
DATE=/bin/date
SED=/bin/sed
GREP=/bin/grep
PYTHON=/usr/bin/python

#What string marks a deep-scrubbing state in ceph pg output?
DEEPMARK="scrubbing+deep"
#Max concurrent deep-scrub operations
MAXSCRUBS=2

startdate=$($DATE +%s)
cat << __EOM__

***************************
* CEPH DEEP-SCRUB CRONJOB *
***************************

Started working:
    $($DATE -R)
    TS: $startdate

__EOM__

#Set work ratio from first arg; fall back to '7'.
workratio=$1
[ "x$workratio" == x ] && workratio=7

function isNewerThan() {
    # Args: [PG] [TIMESTAMP]
    # Output: None
    # Returns: 0 if changed; 1 otherwise
    # Desc: Check if a placement group "PG" deep-scrub stamp has changed
    # (i.e. != "TIMESTAMP")
    pg=$1
    ots=$2
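    #Note: the inline python below relies on the python2 "print" statement;
    #adapt it if only python3 is available on your nodes.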
    ndate=$($CEPH pg $pg query -f json-pretty | \
        $PYTHON -c 'import json;import sys; print json.loads(sys.stdin.read())["info"]["stats"]["last_deep_scrub_stamp"]')
    nts=$($DATE -d "$ndate" +%s)
    [ $ots -ne $nts ] && return 0
    return 1
}

function scrubbingCount() {
    # Args: None
    # Output: int
    # Returns: 0
    # Desc: Outputs the number of concurrent deep-scrubbing tasks.
    cnt=$($CEPH -s | $GREP $DEEPMARK | $AWK '{ print $1; }')
    [ "x$cnt" == x ] && cnt=0
    echo $cnt
    return 0
}

function waitForScrubSlot() {
    # Args: None
    # Output: Informative text
    # Returns: true
    # Desc: Idle loop waiting for a free deepscrub slot.
    scount=$(scrubbingCount)
    if [ $scount -ge $MAXSCRUBS ]; then
        echo -n "Waiting for a scrub slot to free up... "
    fi
    while [ $(scrubbingCount) -ge $MAXSCRUBS ]; do
        sleep 1
    done
    echo 'OK'
    return 0
}

function deepScrubPg() {
    # Args: [PG]
    # Output: Informative text
    # Return: 0 when PG is effectively deep-scrubbing
    # Desc: Start a PG "PG" deep-scrub
    echo Scrubbing $1
    $CEPH pg deep-scrub $1
    #Must sleep as ceph does not immediately start scrubbing
    #So we wait until wanted PG effectively goes into deep scrubbing state...
    echo -n 'Waiting for PG to get into scrub mode... '
    while ! $CEPH pg $1 query | $GREP state | $GREP -q $DEEPMARK; do
        isNewerThan $1 $2 && echo -n "[PG scrub stamp got updated] " && break
        sleep 1
    done
    echo 'OK'
    echo
    return 0
}

function getOldestScrubs() {
    # Args: [num_res]
    # Output: [num_res] PG ids
    # Return: 0
    # Desc: Get the "num_res" oldest deep-scrubbed PGs
    numres=$1
    [ x$numres == x ] && numres=20
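    #Column layout assumed from 'ceph pg dump' output: $1=pgid, $9=state,
    #$20/$21=last deep-scrub date and time (positions may vary across Ceph releases)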
    $CEPH pg dump summary 2>/dev/null | \
        $AWK '/^[0-9a-z]\.[0-9a-z]+/ { if($9 == "active+clean") {  print $1,$20,$21 ; }; }' | \
        while read line; do set $line; echo $1 $($DATE -d "$2 $3" +%s); done | \
        $SORT -n -k2  | \
        $HEAD -n $numres
    return 0
}

function getPgCount() {
    # Args:
    # Output: number of total PGs
    # Desc: Output the total number of "active+clean" PGs
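    #Expects output such as "v123456: 1024 pgs: 1024 active+clean; ..." (example only;
    #the exact format may differ across Ceph releases)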
    $CEPH pg stat | $SED 's/^.* \([0-9]\+\) active+clean[^+].*/\1/g'
}

#Get PG count
pgcnt=$(getPgCount)
#Get the number of PGs we'll be working on
pgwork=$((pgcnt / workratio + 1))

#Actual work starts here, quite self-explanatory.
echo "About to scrub 1/${workratio} of $pgcnt PGs = $pgwork PGs to scrub"
getOldestScrubs $pgwork | while read line; do
    now=$($DATE +%s)
    set $line
    age=$(expr $now - $2)
    days=$((age / 3600 / 24))
    echo "PG $1 is $days days - $age seconds old"
    waitForScrubSlot
    deepScrubPg $1 $2
done

echo "Finished batch"
enddate=$($DATE +%s)
echo "***EndOfRun $($DATE -R) // took $((enddate - startdate)) seconds"


Mydumper: subdirectories mode patch

Rationale

Whether you run a “small” MySQL instance, with a data directory of a few dozen gigabytes and tens of thousands of tables, or a much bigger one, you’ve probably been bothered by the way mydumper manages its output: a flat sequence of files with no structure. Just to make things clear: mydumper is a great piece of software, especially when you run a mix of InnoDB and MyISAM tables in a master-master scenario, where backup consistency is vital.

In my case it did however bite hard when I wanted to replay a backup for a specific table: the backup solution took quite some time just to display the file list within the archive…

The patch below changes just that; it adds a “-d” (or “--subdir”) switch to mydumper that turns on “subdir mode”. You get one sub-directory per database, with all related table-schema and table-data files within that directory. Myloader of course also gets updated to read subdir-based backups.
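
Usage stays the same as stock mydumper apart from the new switch; a hypothetical run (host, credentials and paths are placeholders) could look like this:

#Dump with one sub-directory per database
mydumper --subdir -o /backup/mysql -h db1.example.com -u backup -p secret
#Which should produce a layout along these lines:
#  /backup/mysql/somedb/sometable-schema.sql
#  /backup/mysql/somedb/sometable.sql
#Restore from that same tree with the patched myloader
myloader -d /backup/mysql -h db1.example.com -u restore -p secret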

The patch applies to mydumper-0.6.2.

Download: mydumper-0.6.2-probesys+subdirs.patch

The mydumper patch itself

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cffc879..d7b674f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 2.6)
 project(mydumper)
-set(VERSION 0.6.2)
+set(VERSION "0.6.2-probesys+subdirs")
 set(ARCHIVE_NAME "${CMAKE_PROJECT_NAME}-${VERSION}")
 
 #Required packages
diff --git a/mydumper.c b/mydumper.c
index 3663e6b..2c0638e 100644
--- a/mydumper.c
+++ b/mydumper.c
@@ -71,6 +71,7 @@ int detected_server= 0;
 int lock_all_tables=0;
 guint snapshot_interval= 60;
 gboolean daemon_mode= FALSE;
+gboolean subdir_mode= FALSE;
 
 gchar *ignore_engines= NULL;
 char **ignore= NULL;
@@ -133,6 +134,7 @@ static GOptionEntry entries[] =
 	{ "binlogs", 'b', 0, G_OPTION_ARG_NONE, &need_binlogs, "Get a snapshot of the binary logs as well as dump data",  NULL },
 #endif
 	{ "daemon", 'D', 0, G_OPTION_ARG_NONE, &daemon_mode, "Enable daemon mode", NULL },
+	{ "subdir", 'd', 0, G_OPTION_ARG_NONE, &subdir_mode, "Enable subdir mode - one subdir per DB", NULL },
 	{ "snapshot-interval", 'I', 0, G_OPTION_ARG_INT, &snapshot_interval, "Interval between each dump snapshot (in minutes), requires --daemon, default 60", NULL },
 	{ "logfile", 'L', 0, G_OPTION_ARG_FILENAME, &logfile, "Log file name to use, by default stdout is used", NULL },
 	{ "tz-utc", 0, 0, G_OPTION_ARG_NONE, NULL, "SET TIME_ZONE='+00:00' at top of dump to allow dumping of TIMESTAMP data when a server has data in different time zones or data is being moved between servers with different time zones, defaults to on use --skip-tz-utc to disable.", NULL },
@@ -152,6 +154,7 @@ void dump_table(MYSQL *conn, char *database, char *table, struct configuration *
 void dump_tables(MYSQL *, GList *, struct configuration *);
 guint64 dump_table_data(MYSQL *, FILE *, char *, char *, char *, char *);
 void dump_database(MYSQL *, char *);
+int prepare_subdir(char *database);
 void get_tables(MYSQL * conn);
 GList * get_chunks_for_table(MYSQL *, char *, char*,  struct configuration *conf);
 guint64 estimate_count(MYSQL *conn, char *database, char *table, char *field, char *from, char *to);
@@ -1467,8 +1470,20 @@ void create_backup_dir(char *new_directory) {
 	}
 }
 
-void dump_database(MYSQL * conn, char *database) {
+int prepare_subdir(char *database) {
+	if (!subdir_mode) return 0;
+	int ret;
+	char *p;
+	if (daemon_mode)
+		p = g_strdup_printf("%s/%d/%s", output_directory, dump_number, database);
+	else
+		p = g_strdup_printf("%s/%s", output_directory, database);
+	ret = g_mkdir_with_parents(p, 0755);
+	return ret;
+}
 
+void dump_database(MYSQL * conn, char *database) {
+	if (subdir_mode) prepare_subdir(database);
 	char *query;
 	mysql_select_db(conn,database);
 	if (detected_server == SERVER_TYPE_MYSQL)
@@ -1692,6 +1707,7 @@ void dump_schema_data(MYSQL *conn, char *database, char *table, char *filename)
 }
 
 void dump_table_data_file(MYSQL *conn, char *database, char *table, char *where, char *filename) {
+	if (subdir_mode) prepare_subdir(database);
 	void *outfile;
 
 	if (!compress_output)
@@ -1719,9 +1735,9 @@ void dump_schema(char *database, char *table, struct configuration *conf) {
 	j->conf=conf;
 	j->type=JOB_SCHEMA;
 	if (daemon_mode)
-		sj->filename = g_strdup_printf("%s/%d/%s.%s-schema.sql%s", output_directory, dump_number, database, table, (compress_output?".gz":""));
+		sj->filename = g_strdup_printf("%s/%d/%s%s%s-schema.sql%s", output_directory, dump_number, database, (subdir_mode?"/":"."), table, (compress_output?".gz":""));
 	else
-		sj->filename = g_strdup_printf("%s/%s.%s-schema.sql%s", output_directory, database, table, (compress_output?".gz":""));
+		sj->filename = g_strdup_printf("%s/%s%s%s-schema.sql%s", output_directory, database, (subdir_mode?"/":"."), table, (compress_output?".gz":""));
 	g_async_queue_push(conf->queue,j);
 	return;
 }
@@ -1744,9 +1760,9 @@ void dump_table(MYSQL *conn, char *database, char *table, struct configuration *
 			j->conf=conf;
 			j->type= is_innodb ? JOB_DUMP : JOB_DUMP_NON_INNODB;
 			if (daemon_mode)
-				tj->filename=g_strdup_printf("%s/%d/%s.%s.%05d.sql%s", output_directory, dump_number, database, table, nchunk,(compress_output?".gz":""));
+				tj->filename=g_strdup_printf("%s/%d/%s%s%s.%05d.sql%s", output_directory, dump_number, database, (subdir_mode?"/":"."), table, nchunk,(compress_output?".gz":""));
 			else
-				tj->filename=g_strdup_printf("%s/%s.%s.%05d.sql%s", output_directory, database, table, nchunk,(compress_output?".gz":""));
+				tj->filename=g_strdup_printf("%s/%s%s%s.%05d.sql%s", output_directory, database, (subdir_mode?"/":"."), table, nchunk,(compress_output?".gz":""));
 			tj->where=(char *)chunks->data;
 			if (!is_innodb && nchunk)
 				g_atomic_int_inc(&non_innodb_table_counter);
@@ -1763,9 +1779,9 @@ void dump_table(MYSQL *conn, char *database, char *table, struct configuration *
 		j->conf=conf;
 		j->type= is_innodb ? JOB_DUMP : JOB_DUMP_NON_INNODB;
 		if (daemon_mode)
-			tj->filename = g_strdup_printf("%s/%d/%s.%s%s.sql%s", output_directory, dump_number, database, table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
+			tj->filename = g_strdup_printf("%s/%d/%s%s%s%s.sql%s", output_directory, dump_number, database, (subdir_mode?"/":"."), table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
 		else
-			tj->filename = g_strdup_printf("%s/%s.%s%s.sql%s", output_directory, database, table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
+			tj->filename = g_strdup_printf("%s/%s%s%s%s.sql%s", output_directory, database, (subdir_mode?"/":"."), table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
 		g_async_queue_push(conf->queue,j);
 		return;
 	}
@@ -1794,9 +1810,9 @@ void dump_tables(MYSQL *conn, GList *noninnodb_tables_list, struct configuration
 				tj->database = g_strdup_printf("%s",dbt->database);
 				tj->table = g_strdup_printf("%s",dbt->table);
 				if (daemon_mode)
-					tj->filename=g_strdup_printf("%s/%d/%s.%s.%05d.sql%s", output_directory, dump_number, dbt->database, dbt->table, nchunk,(compress_output?".gz":""));
+					tj->filename=g_strdup_printf("%s/%d/%s%s%s.%05d.sql%s", output_directory, dump_number, dbt->database, (subdir_mode?"/":"."), dbt->table, nchunk,(compress_output?".gz":""));
 				else
-					tj->filename=g_strdup_printf("%s/%s.%s.%05d.sql%s", output_directory, dbt->database, dbt->table, nchunk,(compress_output?".gz":""));
+					tj->filename=g_strdup_printf("%s/%s%s%s.%05d.sql%s", output_directory, dbt->database, (subdir_mode?"/":"."), dbt->table, nchunk,(compress_output?".gz":""));
 				tj->where=(char *)chunks->data;
 				tjs->table_job_list= g_list_append(tjs->table_job_list, tj);
 				nchunk++;
@@ -1806,9 +1822,9 @@ void dump_tables(MYSQL *conn, GList *noninnodb_tables_list, struct configuration
 			tj->database = g_strdup_printf("%s",dbt->database);
 			tj->table = g_strdup_printf("%s",dbt->table);
 			if (daemon_mode)
-				tj->filename = g_strdup_printf("%s/%d/%s.%s%s.sql%s", output_directory, dump_number, dbt->database, dbt->table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
+				tj->filename = g_strdup_printf("%s/%d/%s%s%s%s.sql%s", output_directory, dump_number, dbt->database, (subdir_mode?"/":"."), dbt->table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
 			else
-				tj->filename = g_strdup_printf("%s/%s.%s%s.sql%s", output_directory, dbt->database, dbt->table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
+				tj->filename = g_strdup_printf("%s/%s%s%s%s.sql%s", output_directory, dbt->database, (subdir_mode?"/":"."), dbt->table,(chunk_filesize?".00001":""),(compress_output?".gz":""));
 			tj->where = NULL;
 			tjs->table_job_list= g_list_append(tjs->table_job_list, tj);
 		}
diff --git a/myloader.c b/myloader.c
index 927fd39..df97479 100644
--- a/myloader.c
+++ b/myloader.c
@@ -24,6 +24,7 @@
 #include <string.h>
 #include <glib.h>
 #include <glib/gstdio.h>
+#include <sys/stat.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <errno.h>
@@ -42,6 +43,8 @@ static GMutex *init_mutex= NULL;
 guint errors= 0;
 
 gboolean read_data(FILE *file, gboolean is_compressed, GString *data, gboolean *eof);
+char *make_rel_path(char *tdir, const char *filename);
+void restore_databases_from_directory(char *tdirectory, struct configuration *conf, MYSQL *conn);
 void restore_data(MYSQL *conn, char *database, char *table, const char *filename, gboolean is_schema);
 void *process_queue(struct thread_data *td);
 void add_table(const gchar* filename, struct configuration *conf);
@@ -173,12 +176,23 @@ int main(int argc, char *argv[]) {
 	return errors ? EXIT_FAILURE : EXIT_SUCCESS;
 }
 
-void restore_databases(struct configuration *conf, MYSQL *conn) {
+
+char *make_rel_path(char *tdir, const char *filename) {
+	gchar** split_file= g_strsplit(tdir, directory, 0);
+	char* rel = split_file[1];
+	if (g_str_has_prefix(rel, "/")) rel++;
+	char* relpath = g_strdup_printf("%s%s%s", rel, (strlen(rel)>0 ? "/" : ""), filename);
+	g_strfreev(split_file);
+	return relpath;
+}
+
+void restore_databases_from_directory(char *tdirectory, struct configuration *conf, MYSQL *conn) {
 	GError *error= NULL;
-	GDir* dir= g_dir_open(directory, 0, &error);
+	GDir* dir= g_dir_open(tdirectory, 0, &error);
+	struct stat *mstat = g_new(struct stat, 1);
 
 	if (error) {
-		g_critical("cannot open directory %s, %s\n", directory, error->message);
+		g_critical("cannot open directory %s, %s\n", tdirectory, error->message);
 		errors++;
 		return;
 	}
@@ -187,7 +201,7 @@ void restore_databases(struct configuration *conf, MYSQL *conn) {
 
 	while((filename= g_dir_read_name(dir))) {
 		if (g_strrstr(filename, "-schema.sql")) {
-			add_schema(filename, conn);
+			add_schema(make_rel_path(tdirectory, filename), conn);
 		}
 	}
 
@@ -195,16 +209,43 @@ void restore_databases(struct configuration *conf, MYSQL *conn) {
 
 	while((filename= g_dir_read_name(dir))) {
 		if (!g_strrstr(filename, "-schema.sql") && g_strrstr(filename, ".sql")) {
-			add_table(filename, conf);
+			add_table(make_rel_path(tdirectory, filename), conf);
 		}
 	}
 
+	g_dir_rewind(dir);
+
+	while((filename= g_dir_read_name(dir))) {
+		gchar* path= g_build_filename(tdirectory, filename, NULL);
+		if (g_stat(path, mstat) < 0) {
+			g_critical("cannot stat file %s\n", filename);
+			errors++;
+			g_free(path);
+			return;
+		}
+		if (S_ISDIR(mstat->st_mode)) {
+			restore_databases_from_directory(path, conf, conn);
+			path=NULL;
+		}
+		g_free(path);
+	}
+	g_free(mstat);
 	g_dir_close(dir);
 }
 
+void restore_databases(struct configuration *conf, MYSQL *conn) {
+	return restore_databases_from_directory(directory, conf, conn);
+}
+
 void add_schema(const gchar* filename, MYSQL *conn) {
 	// 0 is database, 1 is table with -schema on the end
-	gchar** split_file= g_strsplit(filename, ".", 0);
+	gchar** split_file = NULL;
+	if (g_strrstr(filename, "/")) {
+		split_file = g_strsplit(filename, "/", 0);
+	}
+	else {
+		split_file= g_strsplit(filename, ".", 0);
+	}
 	gchar* database= split_file[0];
 	// Remove the -schema from the table name
 	gchar** split_table= g_strsplit(split_file[1], "-schema", 0);
@@ -251,7 +292,13 @@ void add_table(const gchar* filename, struct configuration *conf) {
 	j->job_data= (void*) rj;
 	rj->filename= g_strdup(filename);
 	j->type= JOB_RESTORE;
-	gchar** split_file= g_strsplit(filename, ".", 0);
+	gchar** split_file = NULL;
+	if (g_strrstr(filename, "/")) {
+		split_file = g_strsplit_set(filename, "/.", 0);
+	}
+	else {
+		split_file= g_strsplit(filename, ".", 0);
+	}
 	rj->database= g_strdup(split_file[0]);
 	rj->table= g_strdup(split_file[1]);
 	rj->part= g_ascii_strtoull(split_file[2], NULL, 10);
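
To use it, apply the patch to an unpacked mydumper 0.6.2 source tree and rebuild (a sketch; the usual mydumper build dependencies are assumed to be installed):

cd mydumper-0.6.2
patch -p1 < mydumper-0.6.2-probesys+subdirs.patch
cmake . && make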

Kindly provided by Probesys.