Skip to content

Commit

Permalink
Prefetching fix (anatol#28)
Browse files Browse the repository at this point in the history
The prefetching commit had some issues:
 * The insertion of all the available packages on the mirror was extremely slow because I didn't do a batch insertion
 * The filename regex was too strict: epoch/version can include + or _, and possibly the arch field too
 * The prefetch links are wrong because the "path" field is missing; I should finish fixing it in a few hours
  • Loading branch information
Focshole authored and anatol committed Sep 10, 2021
1 parent 514a6ac commit 1911e51
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 45 deletions.
18 changes: 12 additions & 6 deletions pacoloco.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ var pathRegex *regexp.Regexp
var filenameRegex *regexp.Regexp // to get the details of a package (arch, version etc)
var filenameDBRegex *regexp.Regexp // to get the filename from the db file
var urlRegex *regexp.Regexp // to extract the relevant parts from an url to compose a pacoloco url

var mirrorDBRegex *regexp.Regexp // to extract the "path" field from a url
var prefetchDB *gorm.DB

// Accepted formats
Expand All @@ -39,14 +39,14 @@ func init() {
// source: https://archlinux.org/pacman/makepkg.conf.5.html PKGEXT section
allowedPackagesExtensions = []string{".pkg.tar.gz", ".pkg.tar.bz2", ".pkg.tar.xz", ".pkg.tar.zst", ".pkg.tar.lzo", ".pkg.tar.lrz", ".pkg.tar.lz4", ".pkg.tar.lz", ".pkg.tar.Z", ".pkg.tar"}

// Filename regex explanation (also here https://regex101.com/r/qB0fQ7/34 )
// Filename regex explanation (also here https://regex101.com/r/qB0fQ7/35 )
/*
The filename relevant matches are:
^([a-z0-9._+-]+) a package filename must be a combination of lowercase letters,numbers,dots, underscores, plus symbols or dashes
- separator
([a-z0-9A-Z:.]+-[0-9]+) epoch/version. an epoch can be written as (whatever)-(sequence of numbers)
([a-z0-9A-Z:._+]+-[0-9]+) epoch/version. an epoch can be written as (whatever)-(sequence of numbers)
- separator
([a-zA-Z0-9._]+) arch
([a-zA-Z0-9:._+]+) arch
- separator
(([.]...)$ file extension, explanation below
Expand All @@ -70,7 +70,7 @@ func init() {
*/
filenameRegex, err = regexp.Compile("^([a-z0-9._+-]+)-([a-z0-9A-Z:.]+-[0-9]+)-([a-zA-Z0-9._]+)(([.]pkg[.]tar(([.]gz)|([.]bz2)|([.]xz)|([.]zst)|([.]lzo)|([.]lrz)|([.]lz4)|([.]lz)|([.]Z))?)([.]sig)?)$")
filenameRegex, err = regexp.Compile("^([a-z0-9._+-]+)-([a-zA-Z0-9:._+]+-[0-9]+)-([a-zA-Z0-9:._+]+)(([.]pkg[.]tar(([.]gz)|([.]bz2)|([.]xz)|([.]zst)|([.]lzo)|([.]lrz)|([.]lz4)|([.]lz)|([.]Z))?)([.]sig)?)$")
if err != nil {
log.Fatal(err)
} // shouldn't happen
Expand All @@ -89,6 +89,12 @@ func init() {
if err != nil {
log.Fatal(err)
} // shouldn't happen
// Starting from a string like "///extra/os/x86_64/extra.db" , it matches "///extra/os/x86_64/"
// More details here https://regex101.com/r/kMGOhq/1
mirrorDBRegex, err = regexp.Compile("^/*([^/]+/+)+")
if err != nil {
log.Fatal(err)
} // shouldn't happen
}

func main() {
Expand Down Expand Up @@ -343,7 +349,7 @@ func downloadFile(url string, filePath string, ifModifiedSince time.Time) (serve
return false, nil
default:
// for most dbs signatures are optional, be quiet if the signature is not found
// quiet a1505e9c4863f3bb3883fe7f3ee7c41f8b865876:= resp.StatusCode == http.StatusNotFound && strings.HasSuffix(url, ".db.sig")
// quiet := resp.StatusCode == http.StatusNotFound && strings.HasSuffix(url, ".db.sig")
err = fmt.Errorf("unable to download url %s, status code is %d", url, resp.StatusCode)
return false, err
}
Expand Down
46 changes: 40 additions & 6 deletions prefetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ func setupPrefetchTicker() *time.Ticker {
ticker := time.NewTicker(duration) // set prefetch as specified in config file
log.Printf("The prefetching routine will be run on %v", time.Now().Add(duration))
go func() {
prefetchPackages()
for {
select {
case <-ticker.C:
lastTimeInvoked := time.Time{}
for range ticker.C {
if time.Since(lastTimeInvoked) > time.Second {
prefetchPackages()
lastTimeInvoked = time.Now()
now := time.Now()
duration, err := getPrefetchDuration(config.Prefetch.Cron, time.Now())
if err == nil && duration > 0 {
Expand All @@ -47,7 +47,7 @@ func setupPrefetchTicker() *time.Ticker {
ticker.Stop()
log.Printf("Prefetching disabled")
}
}
} // otherwise ignore it. It happened more than once that this function gets invoked twice for no reason
}
}()
return ticker
Expand Down Expand Up @@ -184,6 +184,36 @@ func cleanPrefetchDB() {
for _, pkgToDel := range deadPkgs {
purgePkgIfExists(&pkgToDel)
}
// delete mirror links which do not exist in the config file
mirrors := getAllMirrorsDB()
for _, mirror := range mirrors {
if repoLinks, exists := config.Repos[mirror.RepoName]; exists {
var URLs []string
if repoLinks.URL != "" {
URLs = append(URLs, repoLinks.URL)
} else {
URLs = repoLinks.URLs
}
// compare the mirror URL with the URLs in the config file
found := false
for _, URL := range URLs {
if strings.Contains(mirror.URL, URL) {
found = true
break
}
}
if !found {
log.Printf("Deleting %v, mirror not found on config file", mirror.URL)
deleteMirrorDBFromDB(mirror)
}

} else {
// there is no repo with that name, I delete the mirrorDB entry
log.Printf("Deleting %v, repo %v does not exist", mirror.URL, mirror.RepoName)
deleteMirrorDBFromDB(mirror)
}
}

// should be useless but this guarantees that everything got cleaned properly
_ = deleteRepoTable()
log.Printf("Db cleaned.\n")
Expand All @@ -204,7 +234,11 @@ func prefetchAllPkgs() {
for _, url := range urls {
if err := prefetchRequest(url); err == nil {
purgePkgIfExists(&pkg) // delete the old package
log.Printf("Successfully prefetched package %v-%v\n", p.PackageName, p.Arch)
if strings.HasSuffix(url, ".sig") {
log.Printf("Successfully prefetched %v-%v signature\n", p.PackageName, p.Arch)
} else {
log.Printf("Successfully prefetched %v-%v package\n", p.PackageName, p.Arch)
}
} else {
failed = append(failed, fmt.Sprintf("Failed to prefetch package at %v because %v\n", url, err))
}
Expand Down
4 changes: 4 additions & 0 deletions prefetch_db.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,7 @@ func getAllMirrorsDB() []MirrorDB {
prefetchDB.Find(&mirrorDBs)
return mirrorDBs
}

// deleteMirrorDBFromDB removes from the mirror_dbs table the entry whose URL
// and repo name match those of m.
//
// The query is Unscoped, so GORM issues a real DELETE rather than a soft
// delete. The chain must be terminated with Delete: building the query with
// Model/Where alone does not execute anything.
func deleteMirrorDBFromDB(m MirrorDB) {
	prefetchDB.Model(&MirrorDB{}).Unscoped().Where("mirror_dbs.url = ? and mirror_dbs.repo_name = ?", m.URL, m.RepoName).Delete(&MirrorDB{})
}
4 changes: 2 additions & 2 deletions prefetch_db_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,13 @@ func TestGetPkgsToUpdate(t *testing.T) {
updateDBDownloadedFile("foo", "webkit-2.3.1-1-x86_64.pkg.tar.zst")
updateDBDownloadedFile("foo", "webkit2-2.3.1-1-x86_64.pkg.tar.zst")
updateDBDownloadedFile("foo", "webkit3-2.4.1-1-x86_64.pkg.tar.zst")
repoPkg, err := buildRepoPkg("webkit-2.4.1-1-x86_64.pkg.tar.zst", "foo")
repoPkg, err := buildRepoPkg("webkit-2.4.1-1-x86_64.pkg.tar.zst", "foo", "")
if err != nil {
t.Fatal(err)
}
prefetchDB.Save(&repoPkg)
// same version, shouldn't be included
repoPkg, err = buildRepoPkg("webkit3-2.4.1-1-x86_64.pkg.tar.zst", "foo")
repoPkg, err = buildRepoPkg("webkit3-2.4.1-1-x86_64.pkg.tar.zst", "foo", "")
if err != nil {
t.Fatal(err)
}
Expand Down
5 changes: 5 additions & 0 deletions prefetch_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ port: 9139
repos:
archlinux:
url: http://mirrors.kernel.org/archlinux
example:
urls:
- http://mirror1.example.org/archlinux
- https://mirror.example.com/mirror/packages/archlinux/
- http://mirror2.example.com/archlinux/test/
`
config = parseConfig([]byte(c))
return tmpDir
Expand Down
81 changes: 66 additions & 15 deletions repo_db_mirror.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ func uncompressGZ(filePath string, targetFile string) error {
}
return nil
}

func extractFilenamesFromTar(filePath string) ([]string, error) {
f, err := os.Open(filePath)
reader := bufio.NewReader(f)
Expand All @@ -67,38 +68,84 @@ func extractFilenamesFromTar(filePath string) ([]string, error) {
}
pkgName := buf.String()
matches := filenameDBRegex.FindStringSubmatch(pkgName) // find %FILENAME% and read the following string
pkgName = matches[1]
pkgList = append(pkgList, pkgName)
if len(matches) == 2 {
pkgName = matches[1]
pkgList = append(pkgList, pkgName)
} else {
log.Printf("Skipping %v cause it doesn't match regex. This is probably a bug.", hdr.Name)
continue
}
}
}
return pkgList, nil
}

// Returns the URL which, when sent to pacoloco, should download exactly this package; the file extension is not included
func getPacolocoURL(pkg Package) string {
return "/repo/" + pkg.RepoName + "/" + pkg.PackageName + "-" + pkg.Version + "-" + pkg.Arch
func getPacolocoURL(pkg Package, prefix string) string {
return strings.ReplaceAll(("/repo/" + pkg.RepoName + "/" + prefix + "/" + pkg.PackageName + "-" + pkg.Version + "-" + pkg.Arch), "//", "/")
}

// Builds a repository package
func buildRepoPkg(fileName string, repoName string) (RepoPackage, error) {
// It requires the prefix, which is the relative path in which the db is contained
func buildRepoPkg(fileName string, repoName string, prefix string) (RepoPackage, error) {
matches := filenameRegex.FindStringSubmatch(fileName)
if len(matches) >= 7 {
packageName := matches[1]
version := matches[2]
arch := matches[3]
pkg := Package{PackageName: packageName, Version: version, Arch: arch, RepoName: repoName}
pacolocoURL := getPacolocoURL(pkg)
pacolocoURL := getPacolocoURL(pkg, prefix)
return RepoPackage{PackageName: packageName, Version: version, Arch: arch, DownloadURL: pacolocoURL, RepoName: repoName}, nil
}
return RepoPackage{}, fmt.Errorf("filename %v does not match regex, matches length is %d", fileName, len(matches))
}

// getPrefixFromMirrorDB computes the "path" portion of a mirror db URL, i.e. the
// part located between the repo base URL taken from the config and the db file name.
//
// For each URL configured for mirror.RepoName, mirror.URL is split on that URL and
// the text following its first occurrence is matched against mirrorDBRegex. The
// whole match is returned, prefixed with "/" when it does not already start with one.
// E.g. with the configured URL https://mirror.example.com/mirror/packages/archlinux/
// and the mirror URL
// https://mirror.example.com/mirror/packages/archlinux//extra/os/x86_64/extra.db
// the result is /extra/os/x86_64/
//
// An empty prefix with a nil error is returned when the regex does not match
// (e.g. //extra.db or extra.db). An error is returned when the repo is not in
// the config or when none of its URLs occurs in mirror.URL.
func getPrefixFromMirrorDB(mirror MirrorDB) (string, error) {
	repoLinks, known := config.Repos[mirror.RepoName]
	if !known {
		// This mirror link is a leftover from an old config
		return "", fmt.Errorf("Error: Mirror link %v is associated with repo %v which does not exist in config.", mirror.URL, mirror.RepoName)
	}

	// A single "url" entry takes precedence over the "urls" list
	var candidates []string
	if repoLinks.URL == "" {
		candidates = repoLinks.URLs
	} else {
		candidates = append(candidates, repoLinks.URL)
	}

	for _, base := range candidates {
		pieces := strings.Split(mirror.URL, base)
		if len(pieces) < 2 {
			// base does not occur in the mirror URL, try the next one
			continue
		}
		found := mirrorDBRegex.FindStringSubmatch(pieces[1])
		if len(found) == 0 {
			// The path is empty, e.g. //extra.db or extra.db
			return "", nil
		}
		path := found[0]
		if strings.HasPrefix(path, "/") {
			return path, nil
		}
		return "/" + path, nil
	}
	return "", fmt.Errorf("Error: Mirror link %v does not exist in repo %v", mirror.URL, mirror.RepoName)
}

// Downloads the db from the mirror and adds RepoPackages
func downloadAndLoadDB(mirror MirrorDB) error {
matches := urlRegex.FindStringSubmatch(mirror.URL)
if len(matches) == 0 {
return fmt.Errorf("url '%v' is invalid, does not match path regex", mirror.URL)
}
prefix, err := getPrefixFromMirrorDB(mirror)
if err != nil {
// If a mirror is invalid, don't download & load it
return err
}

fileName := matches[4]
// create directory if it does not exist
Expand All @@ -115,35 +162,39 @@ func downloadAndLoadDB(mirror MirrorDB) error {
if _, err := downloadFile(mirror.URL, filePath, ifModifiedSince); err != nil {
return err
}

log.Printf("Extracting %v...", filePath)
// the db file exists and has been downloaded. Now it is time to decompress it
if err := uncompressGZ(filePath, filePath+".tar"); err != nil {
return err
}

// delete the original file
if err := os.Remove(filePath); err != nil {
return err
}
log.Printf("Parsing %v...", filePath+".tar")
fileList, err := extractFilenamesFromTar(filePath + ".tar") // file names are structured as name-version-subversionnumber
log.Printf("Parsed %v.", filePath+".tar")
if err != nil {
return err
}
if err := os.Remove(filePath + ".tar"); err != nil {
return err
}
log.Printf("Adding entries to db...")
var repoList []RepoPackage
for _, fileName := range fileList {
rpkg, err := buildRepoPkg(fileName, mirror.RepoName)
rpkg, err := buildRepoPkg(fileName, mirror.RepoName, prefix)
if err != nil {
// If a repo package has an invalid name
// e.g. is not a repo package, maybe it is a src package or whatever, we skip it
log.Printf("error: %v\n", err)
continue
}
prefetchDB.Save(rpkg)
repoList = append(repoList, rpkg)
}
prefetchDB.Save(&repoList)
log.Printf("Added entries to db.")
return nil

}

// download dbs from their URLs stored in the mirror_dbs table and load their content in the repo_packages table
Expand All @@ -166,9 +217,9 @@ func downloadAndLoadDbs() error {
return nil
}

func updateMirrorsDbs() {
createRepoTable()
if err := downloadAndLoadDbs(); err != nil {
log.Printf("An error occurred while downloading db files: %v", err)
func updateMirrorsDbs() error {
if err := createRepoTable(); err != nil {
return err
}
return downloadAndLoadDbs()
}
Loading

0 comments on commit 1911e51

Please sign in to comment.