Verified Commit f4a21a99 authored by Robin Sonnabend

Add script to filter etherpad SQL by pad

The script expects an SQL dump (created with mysqldump) and a list of pad
prefixes to filter for. The resulting SQL is written to stdout.
Use --invert to invert the match (useful for splitting a dump in two).
Entries with invalid JSON (usually old revisions and chat entries) might be dropped.
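
For example, a dump could be split in two roughly like this (the script and
file names are placeholders, not part of the commit; the arguments follow the
script's argparse setup):

./filter_etherpad.py dump.sql protected_ > protected.sql
./filter_etherpad.py --invert dump.sql protected_ > everything-else.sql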
*.sql
*.sql.zst
#!/usr/bin/env python3
import json
import re
import sys

# Matches a single ('key','value') tuple inside a mysqldump INSERT line.
# Escaped quotes (\') inside key or value are allowed; the trailing [^\\]
# ensures the closing quote of the value is not itself escaped.
PATTERN = r"\('(?P<key>(?:[^']|\\')*)','(?P<value>(?:[^']|\\')*[^\\])'\)"
p = re.compile(PATTERN)

def filter_pad(pad_id, prefixes, invert):
    result = pad_id.startswith(prefixes)
    if invert:
        result = not result
    return result

def read_value(value):
    # Values in the dump are JSON with backslash escape sequences; undo the
    # escaping before parsing.
    try:
        return json.loads(value.encode("utf-8").decode("unicode-escape"))
    except UnicodeDecodeError:
        print(f"broken value {value!r}", file=sys.stderr)
        raise
    except json.decoder.JSONDecodeError:
        print(f"Invalid JSON: {value!r}, skipping entry", file=sys.stderr)
        return None

def load_file(filename, pad_prefixes, authors, invert=False):
    metadata = {}
    authors_known = bool(authors)
    with open(filename, "r") as file:
        for line in file:
            if not line.startswith("INSERT INTO"):
                continue
            for match in p.finditer(line):
                key = match.group("key")
                value = match.group("value")
                number_of_colons = key.count(":")
                if number_of_colons == 0:
                    metadata[key] = value
                elif number_of_colons == 1:
                    first, second = key.split(":")
                    if first == "pad":
                        if filter_pad(second, pad_prefixes, invert):
                            yield key, value
                    elif first == "pad2readonly":
                        if filter_pad(second, pad_prefixes, invert):
                            yield key, value
                    elif first == "readonly2pad":
                        # Here the pad ID is stored in the value, not the key.
                        pad_id = read_value(value)
                        if pad_id and filter_pad(pad_id, pad_prefixes, invert):
                            yield key, value
                    elif first == "globalAuthor":
                        if authors_known:
                            if second in authors:
                                yield key, value
                    elif first == "token2author":
                        if authors_known:
                            author = read_value(value)
                            if author and author in authors:
                                yield key, value
                    elif first == "sessionstorage":
                        pass
                    else:
                        raise NotImplementedError(f"Unknown key type {key!r}")
                elif number_of_colons == 3:
                    first, second, third, fourth = key.split(":")
                    if first == "pad" and third == "revs":
                        if filter_pad(second, pad_prefixes, invert):
                            yield key, value
                            if not authors_known:
                                # First pass: remember which authors touched
                                # the matching pads.
                                rev = read_value(value)
                                if rev:
                                    authors.add(rev["meta"]["author"])
                    elif first == "pad" and third == "chat":
                        if filter_pad(second, pad_prefixes, invert):
                            yield key, value
                    else:
                        raise NotImplementedError(f"Unknown key type {key!r}")
                else:
                    raise NotImplementedError(f"Unknown key type {key!r}")

def write_header(file):
    file.write("DROP TABLE IF EXISTS `store`;\n")
    file.write(
        """CREATE TABLE `store` (
  `key` varchar(100) COLLATE utf8_bin NOT NULL DEFAULT '',
  `value` longtext COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`key`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
""")
    file.write("LOCK TABLES `store` WRITE;\n")

def write_footer(file):
    file.write("UNLOCK TABLES;\n")

def write_entries(file, entries):
    file.write("INSERT INTO `store` VALUES ")
    file.write(",".join(f"('{key}','{value}')" for key, value in entries))
    file.write(";\n")

def main():
    import argparse

    parser = argparse.ArgumentParser(description="Filter SQL dump of etherpad DB by pad ID")
    parser.add_argument("filename")
    parser.add_argument("prefixes", nargs="+", help="Filter for pads starting with these prefixes")
    parser.add_argument("--invert", action="store_true", help="invert the match")
    parser.add_argument("--entries-per-line", type=int, default=1000)
    args = parser.parse_args()
    prefixes = tuple(args.prefixes)
    authors = set()
    # First pass: run through the dump only to collect the authors of matching
    # pads; the entries themselves are discarded.
    for entry in load_file(args.filename, prefixes, authors, invert=args.invert):
        pass
    # Second pass: emit all matching entries, batched into INSERT statements.
    write_header(sys.stdout)
    cache = []
    for entry in load_file(args.filename, prefixes, authors, invert=args.invert):
        cache.append(entry)
        if len(cache) >= args.entries_per_line:
            write_entries(sys.stdout, cache)
            cache = []
    if cache:
        write_entries(sys.stdout, cache)
    write_footer(sys.stdout)


if __name__ == "__main__":
    main()
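
As a rough, self-contained illustration of the parsing step (the sample INSERT
line below is invented, not taken from a real dump; real pad values contain
Etherpad's full JSON structure), the compiled PATTERN pulls each ('key','value')
tuple out of a mysqldump line, including values with escaped quotes:

import re

PATTERN = r"\('(?P<key>(?:[^']|\\')*)','(?P<value>(?:[^']|\\')*[^\\])'\)"

# Invented sample line in the format the script expects.
sample = (
    "INSERT INTO `store` VALUES "
    "('pad:demo','{\"text\":\"it\\'s a pad\"}'),"
    "('pad:demo:revs:0','{\"meta\":{\"author\":\"a.x1\"}}');"
)

for m in re.finditer(PATTERN, sample):
    print(m.group("key"), "->", m.group("value"))

The extracted keys are then dispatched in load_file by their number of colons:
pad:<id>, pad2readonly:<id>, globalAuthor:<id> and friends on one colon,
pad:<id>:revs:<n> and pad:<id>:chat:<n> on three.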