[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
File search
From: |
Ludovic Courtès |
Subject: |
File search |
Date: |
Fri, 21 Jan 2022 10:03:43 +0100 |
User-agent: |
Gnus/5.13 (Gnus v5.13) Emacs/27.2 (gnu/linux) |
Hello Guix!
Lately I found myself going several times to
<https://packages.debian.org> to look for packages providing a given
file and I thought it’s time to do something about it.
The script below creates an SQLite database for the current set of
packages, but only for those already in the store:
guix repl file-database.scm populate
That creates /tmp/db; it took about 25mn on berlin, for 18K packages.
Then you can run, say:
guix repl file-database.scm search boot-9.scm
to find which packages provide a file named ‘boot-9.scm’. That part is
instantaneous.
The database for 18K packages is quite big:
--8<---------------cut here---------------start------------->8---
$ du -h /tmp/db*
389M /tmp/db
82M /tmp/db.gz
61M /tmp/db.zst
--8<---------------cut here---------------end--------------->8---
How do we expose that information? There are several criteria I can
think of: accuracy, freshness, privacy, responsiveness, off-line
operation.
I think accuracy (making sure you get results that correspond precisely
to, say, your current channel revisions and your current system) is not
a high priority: some result is better than no result. Likewise for
freshness: results for an older version of a given package may still be
valid now.
In terms of privacy, I think it’s better if we can avoid making one
request per file searched for. Off-line operation would be sweet, and
it comes with responsiveness; fast off-line search is necessary for
things like ‘command-not-found’ (where the shell tells you what package
to install when a command is not found).
Based on that, it is tempting to just distribute a full database from
ci.guix, say, that the client command would regularly fetch. The
downside is that that’s quite a lot of data to download; if you use the
file search command infrequently, you might find yourself spending more
time downloading the database than actually searching it.
We could have a hybrid solution: distribute a database that contains
only files in /bin and /sbin (it should be much smaller), and for
everything else, resort to a web service (the Data Service could be
extended to include file lists). That way, we’d have fast
privacy-respecting search for command names, and on-line search for
everything else.
Thoughts?
Ludo’.
;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2022 Ludovic Courtès <ludo@gnu.org>
;;;
;;; This file is part of GNU Guix.
;;;
;;; GNU Guix is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; GNU Guix is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with GNU Guix. If not, see <http://www.gnu.org/licenses/>.
(define-module (file-database)
#:use-module (sqlite3)
#:use-module (ice-9 match)
#:use-module (guix store)
#:use-module (guix monads)
#:autoload (guix grafts) (%graft?)
#:use-module (guix derivations)
#:use-module (guix packages)
#:autoload (guix build utils) (find-files)
#:autoload (gnu packages) (fold-packages)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-9)
#:export (file-database))
(define schema
"
create table if not exists Packages (
id integer primary key autoincrement not null,
name text not null,
version text not null
);
create table if not exists Directories (
id integer primary key autoincrement not null,
name text not null,
package integer not null,
foreign key (package) references Packages(id) on delete cascade
);
create table if not exists Files (
name text not null,
basename text not null,
directory integer not null,
foreign key (directory) references Directories(id) on delete cascade
);
create index if not exists IndexFiles on Files(basename);")
(define (call-with-database file proc)
(let ((db (sqlite-open file)))
(dynamic-wind
(lambda () #t)
(lambda ()
(sqlite-exec db schema)
(proc db))
(lambda ()
(sqlite-close db)))))
(define (insert-files db package version directories)
"Insert the files contained in DIRECTORIES as belonging to PACKAGE at
VERSION."
(define last-row-id-stmt
(sqlite-prepare db "SELECT last_insert_rowid();"
#:cache? #t))
(define package-stmt
(sqlite-prepare db "\
INSERT OR REPLACE INTO Packages(name, version)
VALUES (:name, :version);"
#:cache? #t))
(define directory-stmt
(sqlite-prepare db "\
INSERT INTO Directories(name, package) VALUES (:name, :package);"
#:cache? #t))
(define file-stmt
(sqlite-prepare db "\
INSERT INTO Files(name, basename, directory)
VALUES (:name, :basename, :directory);"
#:cache? #t))
(sqlite-exec db "begin immediate;")
(sqlite-bind-arguments package-stmt
#:name package
#:version version)
(sqlite-fold (const #t) #t package-stmt)
(match (sqlite-fold cons '() last-row-id-stmt)
((#(package-id))
(pk 'package package-id package)
(for-each (lambda (directory)
(define (strip file)
(string-drop file (+ (string-length directory) 1)))
(sqlite-reset directory-stmt)
(sqlite-bind-arguments directory-stmt
#:name directory
#:package package-id)
(sqlite-fold (const #t) #t directory-stmt)
(match (sqlite-fold cons '() last-row-id-stmt)
((#(directory-id))
(for-each (lambda (file)
;; If DIRECTORY is a symlink, (find-files
;; DIRECTORY) returns the DIRECTORY singleton.
(unless (string=? file directory)
(sqlite-reset file-stmt)
(sqlite-bind-arguments file-stmt
#:name (strip file)
#:basename
(basename file)
#:directory
directory-id)
(sqlite-fold (const #t) #t file-stmt)))
(find-files directory)))))
directories)
(sqlite-exec db "commit;"))))
(define (insert-package db package)
"Insert all the files of PACKAGE into DB."
(mlet %store-monad ((drv (package->derivation package #:graft? #f)))
(match (derivation->output-paths drv)
(((labels . directories) ...)
(when (every file-exists? directories)
(insert-files db (package-name package) (package-version package)
directories))
(return #t)))))
(define (insert-packages db)
"Insert all the current packages into DB."
(with-store store
(parameterize ((%graft? #f))
(fold-packages (lambda (package _)
(run-with-store store
(insert-package db package)))
#t
#:select? (lambda (package)
(and (not (hidden-package? package))
(not (package-superseded package))
(supported-package? package)))))))
(define-record-type <package-match>
(package-match name version file)
package-match?
(name package-match-name)
(version package-match-version)
(file package-match-file))
(define (matching-packages db file)
"Return a list of <package-match> corresponding to packages containing
FILE."
(define lookup-stmt
(sqlite-prepare db "\
SELECT Packages.name, Packages.version, Directories.name, Files.name
FROM Packages
INNER JOIN Files, Directories
ON files.basename = :file AND directories.id = files.directory AND packages.id
= directories.package;"))
(sqlite-bind-arguments lookup-stmt #:file file)
(sqlite-fold (lambda (result lst)
(match result
(#(package version directory file)
(cons (package-match package version
(string-append directory "/" file))
lst))))
'() lookup-stmt))
(define (file-database . args)
(match args
((_ "populate")
(call-with-database "/tmp/db"
(lambda (db)
(insert-packages db))))
((_ "search" file)
(let ((matches (call-with-database "/tmp/db"
(lambda (db)
(matching-packages db file)))))
(for-each (lambda (result)
(format #t "~20a ~a~%"
(string-append (package-match-name result)
"@" (package-match-version result))
(package-match-file result)))
matches)
(exit (pair? matches))))
(_
(format (current-error-port)
"usage: file-database [populate|search] args ...~%")
(exit 1))))
(apply file-database (command-line))
- File search,
Ludovic Courtès <=
Re: File search, raingloom, 2022/01/22