From e50a8c7a142487d88fe0780c24873c1b95a2283c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Wed, 27 Dec 2017 19:31:42 +0100 Subject: [PATCH] resource: Use MD5 to identify image files But only a set of byte chunks spread around in the image file to calculate the fingerprint, which is much faster than reading the whole file: ```bash BenchmarkMD5FromFileFast/full=false-4 300000 4356 ns/op 240 B/op 5 allocs/op BenchmarkMD5FromFileFast/full=true-4 30000 42899 ns/op 32944 B/op 5 allocs/op ``` Fixes #4186 --- Gopkg.lock | 6 +-- Gopkg.toml | 2 +- helpers/general.go | 53 ++++++++++++++++++++++++ helpers/general_test.go | 90 +++++++++++++++++++++++++++++++++++++++++ resource/image.go | 27 ++++++------- resource/image_test.go | 12 ++---- resource/resource.go | 12 ++++++ 7 files changed, 176 insertions(+), 26 deletions(-) diff --git a/Gopkg.lock b/Gopkg.lock index 44f957d18..4e79fd569 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -240,8 +240,8 @@ ".", "mem" ] - revision = "8d919cbe7e2627e417f3e45c3c0e489a5b7e2536" - version = "v1.0.0" + revision = "ec3a3111d1e1bdff38a61e09d5a5f5e974905611" + version = "v1.0.1" [[projects]] name = "github.com/spf13/cast" @@ -369,6 +369,6 @@ [solve-meta] analyzer-name = "dep" analyzer-version = 1 - inputs-digest = "2d9c34c260bc26814a0635c93009daeb9d8ffa56c29c0cff6827ae2d3e9ef96d" + inputs-digest = "7259b4caf8e75db0b809f06d4897dc870261252e3aecd68ea1348c87a5da9d50" solver-name = "gps-cdcl" solver-version = 1 diff --git a/Gopkg.toml b/Gopkg.toml index cef7427ce..cc2f66867 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -82,7 +82,7 @@ [[constraint]] name = "github.com/spf13/afero" - version = "1.0.0" + version = "1.0.1" [[constraint]] name = "github.com/spf13/cast" diff --git a/helpers/general.go b/helpers/general.go index 4bb4eb584..dcbac697e 100644 --- a/helpers/general.go +++ b/helpers/general.go @@ -26,6 +26,8 @@ import ( "unicode" "unicode/utf8" + "github.com/spf13/afero" + "github.com/jdkato/prose/transform" bp "github.com/gohugoio/hugo/bufferpool" @@ -372,6 +374,57 @@ func MD5String(f string) string { return hex.EncodeToString(h.Sum([]byte{})) } +// MD5FromFileFast creates a MD5 hash from the given file. It only reads parts of +// the file for speed, so don't use it if the files are very subtly different. +// It will not close the file. +func MD5FromFileFast(f afero.File) (string, error) { + const ( + // Do not change once set in stone! + maxChunks = 8 + peekSize = 64 + seek = 2048 + ) + + h := md5.New() + buff := make([]byte, peekSize) + + for i := 0; i < maxChunks; i++ { + if i > 0 { + _, err := f.Seek(seek, 0) + if err != nil { + if err == io.EOF { + break + } + return "", err + } + } + + _, err := io.ReadAtLeast(f, buff, peekSize) + if err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + h.Write(buff) + break + } + return "", err + } + h.Write(buff) + } + + h.Write(buff) + + return hex.EncodeToString(h.Sum(nil)), nil +} + +// MD5FromFile creates a MD5 hash from the given file. +// It will not close the file. +func MD5FromFile(f afero.File) (string, error) { + h := md5.New() + if _, err := io.Copy(h, f); err != nil { + return "", nil + } + return hex.EncodeToString(h.Sum(nil)), nil +} + // IsWhitespace determines if the given rune is whitespace. func IsWhitespace(r rune) bool { return r == ' ' || r == '\t' || r == '\n' || r == '\r' diff --git a/helpers/general_test.go b/helpers/general_test.go index 2bca632e0..6a7dd4883 100644 --- a/helpers/general_test.go +++ b/helpers/general_test.go @@ -14,10 +14,12 @@ package helpers import ( + "fmt" "reflect" "strings" "testing" + "github.com/spf13/afero" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -270,3 +272,91 @@ func TestToLowerMap(t *testing.T) { } } } + +func TestFastMD5FromFile(t *testing.T) { + fs := afero.NewMemMapFs() + + if err := afero.WriteFile(fs, "small.txt", []byte("abc"), 0777); err != nil { + t.Fatal(err) + } + + if err := afero.WriteFile(fs, "small2.txt", []byte("abd"), 0777); err != nil { + t.Fatal(err) + } + + if err := afero.WriteFile(fs, "bigger.txt", []byte(strings.Repeat("a bc d e", 100)), 0777); err != nil { + t.Fatal(err) + } + + if err := afero.WriteFile(fs, "bigger2.txt", []byte(strings.Repeat("c d e f g", 100)), 0777); err != nil { + t.Fatal(err) + } + + req := require.New(t) + + sf1, err := fs.Open("small.txt") + req.NoError(err) + sf2, err := fs.Open("small2.txt") + req.NoError(err) + + bf1, err := fs.Open("bigger.txt") + req.NoError(err) + bf2, err := fs.Open("bigger2.txt") + req.NoError(err) + + defer sf1.Close() + defer sf2.Close() + defer bf1.Close() + defer bf2.Close() + + m1, err := MD5FromFileFast(sf1) + req.NoError(err) + req.Equal("308d8a1127b46524b51507424071c22c", m1) + + m2, err := MD5FromFileFast(sf2) + req.NoError(err) + req.NotEqual(m1, m2) + + m3, err := MD5FromFileFast(bf1) + req.NoError(err) + req.NotEqual(m2, m3) + + m4, err := MD5FromFileFast(bf2) + req.NoError(err) + req.NotEqual(m3, m4) + + m5, err := MD5FromFile(bf2) + req.NoError(err) + req.NotEqual(m4, m5) +} + +func BenchmarkMD5FromFileFast(b *testing.B) { + fs := afero.NewMemMapFs() + + for _, full := range []bool{false, true} { + b.Run(fmt.Sprintf("full=%t", full), func(b *testing.B) { + for i := 0; i < b.N; i++ { + b.StopTimer() + if err := afero.WriteFile(fs, "file.txt", []byte(strings.Repeat("1234567890", 2000)), 0777); err != nil { + b.Fatal(err) + } + f, err := fs.Open("file.txt") + if err != nil { + b.Fatal(err) + } + b.StartTimer() + if full { + if _, err := MD5FromFile(f); err != nil { + b.Fatal(err) + } + } else { + if _, err := MD5FromFileFast(f); err != nil { + b.Fatal(err) + } + } + f.Close() + } + }) + } + +} diff --git a/resource/image.go b/resource/image.go index c039f68b6..916f89a48 100644 --- a/resource/image.go +++ b/resource/image.go @@ -112,6 +112,8 @@ type Image struct { imaging *Imaging + hash string + *genericResource } @@ -129,6 +131,7 @@ func (i *Image) Height() int { func (i *Image) WithNewBase(base string) Resource { return &Image{ imaging: i.imaging, + hash: i.hash, genericResource: i.genericResource.WithNewBase(base).(*genericResource)} } @@ -490,6 +493,7 @@ func (i *Image) clone() *Image { return &Image{ imaging: i.imaging, + hash: i.hash, genericResource: &g} } @@ -497,20 +501,11 @@ func (i *Image) setBasePath(conf imageConfig) { i.rel = i.filenameFromConfig(conf) } -// We need to set this to something static during tests. -var fiModTimeFunc = func(fi os.FileInfo) int64 { - return fi.ModTime().Unix() -} - func (i *Image) filenameFromConfig(conf imageConfig) string { p1, p2 := helpers.FileAndExt(i.rel) - sizeModeStr := fmt.Sprintf("_S%d_T%d", i.osFileInfo.Size(), fiModTimeFunc(i.osFileInfo)) - // On scaling an already scaled image, we get the file info from the original. - // Repeating the same info in the filename makes it stuttery for no good reason. - if strings.Contains(p1, sizeModeStr) { - sizeModeStr = "" - } + idStr := fmt.Sprintf("_H%s_%d", i.hash, i.osFileInfo.Size()) + // Do not change for no good reason. const md5Threshold = 100 key := conf.key() @@ -518,12 +513,16 @@ func (i *Image) filenameFromConfig(conf imageConfig) string { // It is useful to have the key in clear text, but when nesting transforms, it // can easily be too long to read, and maybe even too long // for the different OSes to handle. - if len(p1)+len(sizeModeStr)+len(p2) > md5Threshold { + if len(p1)+len(idStr)+len(p2) > md5Threshold { key = helpers.MD5String(p1 + key + p2) - p1 = p1[:strings.Index(p1, "_S")] + p1 = p1[:strings.Index(p1, "_H")] + } else if strings.Contains(p1, idStr) { + // On scaling an already scaled image, we get the file info from the original. + // Repeating the same info in the filename makes it stuttery for no good reason. + idStr = "" } - return fmt.Sprintf("%s%s_%s%s", p1, sizeModeStr, key, p2) + return fmt.Sprintf("%s%s_%s%s", p1, idStr, key, p2) } func decodeImaging(m map[string]interface{}) (Imaging, error) { diff --git a/resource/image_test.go b/resource/image_test.go index 3543abb37..61a9ef844 100644 --- a/resource/image_test.go +++ b/resource/image_test.go @@ -15,7 +15,6 @@ package resource import ( "fmt" - "os" "testing" "github.com/stretchr/testify/require" @@ -52,9 +51,6 @@ func TestParseImageConfig(t *testing.T) { } func TestImageTransform(t *testing.T) { - fiModTimeFunc = func(fi os.FileInfo) int64 { - return int64(10111213) - } assert := require.New(t) @@ -86,13 +82,13 @@ func TestImageTransform(t *testing.T) { assert.Equal(200, resizedAndRotated.Height()) assertFileCache(assert, image.spec.Fs, resizedAndRotated.RelPermalink(), 125, 200) - assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center.jpg", resized.RelPermalink()) + assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_300x200_resize_q75_box_center.jpg", resized.RelPermalink()) assert.Equal(300, resized.Width()) assert.Equal(200, resized.Height()) fitted, err := resized.Fit("50x50") assert.NoError(err) - assert.Equal("/a/sunset_S90587_T10111213_300x200_resize_q75_box_center_50x50_fit_q75_box_center.jpg", fitted.RelPermalink()) + assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9b37eba4e4e6ea0cc56a59bb5aa98143.jpg", fitted.RelPermalink()) assert.Equal(50, fitted.Width()) assert.Equal(31, fitted.Height()) @@ -100,13 +96,13 @@ func TestImageTransform(t *testing.T) { fittedAgain, _ := fitted.Fit("10x20") fittedAgain, err = fittedAgain.Fit("10x20") assert.NoError(err) - assert.Equal("/a/sunset_f1fb715a17c42d5d4602a1870424d590.jpg", fittedAgain.RelPermalink()) + assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_9a8be1402216c385e0dfd73e267c6827.jpg", fittedAgain.RelPermalink()) assert.Equal(10, fittedAgain.Width()) assert.Equal(6, fittedAgain.Height()) filled, err := image.Fill("200x100 bottomLeft") assert.NoError(err) - assert.Equal("/a/sunset_S90587_T10111213_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink()) + assert.Equal("/a/sunset_H47566bb0ca0462db92c65f4033d77175_90587_200x100_fill_q75_box_bottomleft.jpg", filled.RelPermalink()) assert.Equal(200, filled.Width()) assert.Equal(100, filled.Height()) assertFileCache(assert, image.spec.Fs, filled.RelPermalink(), 200, 100) diff --git a/resource/resource.go b/resource/resource.go index 2c934d031..19392f3d3 100644 --- a/resource/resource.go +++ b/resource/resource.go @@ -153,7 +153,19 @@ func (r *Spec) newResource( gr := r.newGenericResource(linker, fi, absPublishDir, absSourceFilename, filepath.ToSlash(relTargetFilename), mimeType) if mimeType == "image" { + f, err := r.Fs.Source.Open(absSourceFilename) + if err != nil { + return nil, err + } + defer f.Close() + + hash, err := helpers.MD5FromFileFast(f) + if err != nil { + return nil, err + } + return &Image{ + hash: hash, imaging: r.imaging, genericResource: gr}, nil }