Signature matching

This commit is contained in:
Alex Ling
2021-01-19 08:43:45 +00:00
parent 7f76322377
commit 667d390be4
8 changed files with 247 additions and 40 deletions

View File

@@ -11,13 +11,14 @@ class Entry
@title = File.basename @zip_path, File.extname @zip_path
@encoded_title = URI.encode @title
@size = (File.size @zip_path).humanize_bytes
id = storage.get_id @zip_path, false
id = storage.get_entry_id @zip_path, File.signature(@zip_path)
if id.nil?
id = random_str
storage.insert_id({
path: @zip_path,
id: id,
is_title: false,
path: @zip_path,
id: id,
title_signature: nil,
entry_signature: File.signature(@zip_path).to_s,
})
end
@id = id

View File

@@ -42,16 +42,6 @@ class Library
end
end
end
db_interval = Config.current.db_optimization_interval_hours
unless db_interval < 1
spawn do
loop do
Storage.default.optimize
sleep db_interval.hours
end
end
end
end
def titles
@@ -119,6 +109,7 @@ class Library
storage.close
Logger.debug "Scan completed"
Storage.default.optimize
end
def get_continue_reading_entries(username)

View File

@@ -3,19 +3,21 @@ require "../archive"
class Title
getter dir : String, parent_id : String, title_ids : Array(String),
entries : Array(Entry), title : String, id : String,
encoded_title : String, mtime : Time, signature : UInt64 = 0
encoded_title : String, mtime : Time, signature : UInt64
@entry_display_name_cache : Hash(String, String)?
def initialize(@dir : String, @parent_id)
storage = Storage.default
id = storage.get_id @dir, true
@signature = Dir.signature dir
id = storage.get_title_id dir, signature
if id.nil?
id = random_str
storage.insert_id({
path: @dir,
id: id,
is_title: true,
path: dir,
id: id,
title_signature: signature.to_s,
entry_signature: nil,
})
end
@id = id
@@ -25,8 +27,6 @@ class Title
@entries = [] of Entry
@mtime = File.info(dir).modification_time
signatures = [] of UInt64
Dir.entries(dir).each do |fn|
next if fn.starts_with? "."
path = File.join dir, fn
@@ -35,18 +35,14 @@ class Title
next if title.entries.size == 0 && title.titles.size == 0
Library.default.title_hash[title.id] = title
@title_ids << title.id
signatures << title.signature
next
end
if [".zip", ".cbz", ".rar", ".cbr"].includes? File.extname path
entry = Entry.new path, self
@entries << entry if entry.pages > 0 || entry.err_msg
signatures << File.size entry.zip_path
end
end
@signature = Digest::CRC32.checksum(signatures.sort.join "").to_u64
mtimes = [@mtime]
mtimes += @title_ids.map { |e| Library.default.title_hash[e].mtime }
mtimes += @entries.map { |e| e.mtime }

View File

@@ -20,9 +20,11 @@ class Storage
@path : String
@db : DB::Database?
alias IDTuple = NamedTuple(path: String,
alias IDTuple = NamedTuple(
path: String,
id: String,
is_title: Bool)
entry_signature: String?,
title_signature: String?)
use_default
@@ -230,16 +232,82 @@ class Storage
end
end
def get_id(path, is_title)
def get_title_id(path, signature)
id = nil
path = Path.new(path).relative_to(Config.current.library_path).to_s
MainFiber.run do
get_db do |db|
if is_title
id = db.query_one? "select id from titles where path = (?)", path,
as: String
else
id = db.query_one? "select id from ids where path = (?)", path,
as: String
# First attempt to find the matching title in DB using BOTH path
# and signature
id = db.query_one? "select id from titles where path = (?) and " \
"signature = (?)", path, signature.to_s, as: String
should_update = id.nil?
# If it fails, try to match using the path only. This could happen
# for example when a new entry is added to the title
id ||= db.query_one? "select id from titles where path = (?)", path,
as: String
# If it still fails, we will have to rely on the signature values.
# This could happen when the user moved or renamed the title, or
# moved or renamed a parent title that contains it
unless id
# If there are multiple rows with the same signature (this could
# happen simply by bad luck, or when the user copied a title),
# pick the row that has the most similar path to the given path
rows = [] of Tuple(String, String)
db.query "select id, path from titles where signature = (?)",
signature.to_s do |rs|
rs.each do
rows << {rs.read(String), rs.read(String)}
end
end
row = rows.max_by?(&.[1].components_similarity(path))
id = row[0] if row
end
# At this point, `id` would still be nil if there's no row matching
# either the path or the signature
# If we did identify a matching title, save the path and signature
# values back to the DB
if id && should_update
db.exec "update titles set path = (?), signature = (?) " \
"where id = (?)", path, signature.to_s, id
end
end
end
id
end
# See the comments in `#get_title_id` to see how this method works.
def get_entry_id(path, signature)
id = nil
path = Path.new(path).relative_to(Config.current.library_path).to_s
MainFiber.run do
get_db do |db|
id = db.query_one? "select id from ids where path = (?) and " \
"signature = (?)", path, signature.to_s, as: String
should_update = id.nil?
id ||= db.query_one? "select id from ids where path = (?)", path,
as: String
unless id
rows = [] of Tuple(String, String)
db.query "select id, path from ids where signature = (?)",
signature.to_s do |rs|
rs.each do
rows << {rs.read(String), rs.read(String)}
end
end
row = rows.max_by?(&.[1].components_similarity(path))
id = row[0] if row
end
if id && should_update
db.exec "update ids set path = (?), signature = (?) " \
"where id = (?)", path, signature.to_s, id
end
end
end
@@ -256,11 +324,14 @@ class Storage
db.transaction do |tran|
conn = tran.connection
@@insert_ids.each do |tp|
if tp[:is_title]
conn.exec "insert into titles values (?, ?, null)", tp[:id],
tp[:path]
path = Path.new(tp[:path])
.relative_to(Config.current.library_path).to_s
if tp[:title_signature]
conn.exec "insert into titles values (?, ?, ?)", tp[:id],
path, tp[:title_signature].to_s
else
conn.exec "insert into ids values (?, ?)", tp[:path], tp[:id]
conn.exec "insert into ids values (?, ?, ?)", path, tp[:id],
tp[:entry_signature].to_s
end
end
end
@@ -363,7 +434,8 @@ class Storage
db.query "select path, id from ids" do |rs|
rs.each do
path = rs.read String
trash_ids << rs.read String unless File.exists? path
fullpath = Path.new(path).expand(Config.current.library_path).to_s
trash_ids << rs.read String unless File.exists? fullpath
end
end
@@ -377,7 +449,8 @@ class Storage
db.query "select path, id from titles" do |rs|
rs.each do
path = rs.read String
trash_titles << rs.read String unless Dir.exists? path
fullpath = Path.new(path).expand(Config.current.library_path).to_s
trash_titles << rs.read String unless Dir.exists? fullpath
end
end

50
src/util/signature.cr Normal file
View File

@@ -0,0 +1,50 @@
class File
  abstract struct Info
    # Returns the `st_ino` field of the stat data held in `@stat`.
    # NOTE(review): relies on the concrete `Info` implementation storing its
    # stat result in `@stat` -- confirm when upgrading the compiler/stdlib.
    def inode
      @stat.st_ino
    end
  end

  # Returns the signature of the file at `filename`.
  #
  # When the extension is not one of `.zip`, `.rar`, `.cbz` or `.cbr`
  # (compared case-sensitively), returns 0 without touching the file system.
  # Otherwise, the signature is a CRC32 checksum computed from the file's
  # inode value, size and mtime (in Unix seconds). This ensures that moving
  # (unless to another device) and renaming the file preserves the signature,
  # while copying or editing the file changes it.
  #
  # For supported extensions the file is looked up with `File.info`, so any
  # error raised by that call (e.g. the file does not exist) propagates.
  def self.signature(filename) : UInt64
    return 0u64 unless %w(.zip .rar .cbz .cbr).includes? File.extname filename

    # Read every field from a single `File.info` result so the inode, size
    # and mtime all describe the same state of the file, and the file is
    # only looked up once.
    info = File.info filename
    signatures = [
      info.inode,
      info.size,
      info.modification_time.to_unix,
    ]

    # The values are sorted and concatenated without a separator before
    # being hashed; this layout is kept as-is so that the resulting
    # signature values do not change.
    Digest::CRC32.checksum(signatures.sort.join).to_u64
  end
end
class Dir
  # Computes the signature of the directory at `dirname`.
  #
  # The result is a CRC32 checksum over the directory's own mtime (in Unix
  # seconds) together with the signatures of everything inside it:
  # sub-directories contribute their own `Dir.signature` (recursively), and
  # files contribute their `File.signature` unless that value is 0 (i.e. the
  # file is not a supported one). Names starting with "." are ignored.
  # Consequently, moving (unless to another device) or renaming the directory
  # keeps the signature, while copying it or editing its content changes it.
  def self.signature(dirname) : UInt64
    parts = [] of (UInt64 | Int64)
    parts << File.info(dirname).modification_time.to_unix

    # Collect the visible entry names first, then walk them.
    visible = self.open(dirname) do |dir|
      dir.entries.reject(&.starts_with?("."))
    end

    visible.each do |name|
      child = File.join dirname, name
      if File.directory? child
        parts << Dir.signature child
        next
      end

      # A zero file signature marks an unsupported file; leave it out.
      file_sig = File.signature child
      parts << file_sig unless file_sig == 0
    end

    Digest::CRC32.checksum(parts.sort.join).to_u64
  end
end

View File

@@ -92,3 +92,18 @@ def sort_titles(titles : Array(Title), opt : SortOptions, username : String)
ary
end
class String
  # Returns the similarity (in [0, 1]) of two paths.
  #
  # Both paths are split into their components. The components are then
  # compared pairwise starting from the end of each path, and the number of
  # equal pairs is divided by the number of components of the shorter path.
  #
  # When either path has no components at all (e.g. an empty string), there
  # is nothing to compare and 0.0 is returned. Without this guard the
  # division below would be 0 / 0, i.e. NaN, which is outside the documented
  # range and cannot be ordered against other similarity values.
  #
  # ```
  # "a/b/c".components_similarity("x/b/c") # => 0.6666666666666666
  # "c".components_similarity("a/b/c")     # => 1.0
  # "".components_similarity("a/b/c")      # => 0.0
  # ```
  def components_similarity(other : String) : Float64
    shorter, longer = [self, other]
      .map { |str| Path.new(str).parts }
      .sort_by &.size
    return 0.0 if shorter.empty?

    # Walk both component lists from the last component backwards; `zip`
    # stops at the end of the shorter list.
    matches = shorter.reverse.zip(longer.reverse).count { |a, b| a == b }
    matches / shorter.size
  end
end