duplicity-team team mailing list archive

Thread
Date

[Merge] lp:~aaron-whitehouse/duplicity/split_glob_matching_from_select into lp:duplicity

To: mp+288206@xxxxxxxxxxxxxxxxxx
From: Aaron Whitehouse <lists@xxxxxxxxxxxxxxxxxx>
Date: Sat, 05 Mar 2016 18:52:24 -0000
Reply-to: mp+288206@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Aaron Whitehouse has proposed merging lp:~aaron-whitehouse/duplicity/split_glob_matching_from_select into lp:duplicity.

Commit message:
Move glob matching code out of selection.py's Select function and into globmatch.py.

Requested reviews:
  duplicity-team (duplicity-team)

For more details, see:
https://code.launchpad.net/~aaron-whitehouse/duplicity/split_glob_matching_from_select/+merge/288206

Move glob matching code out of selection.py's Select function and into globmatch.py for maintainability of the code base.

The Select function in selection.py is quite difficult to follow and is currently very long. Much of the bulk of the function comprises glob matching, which does not require any state from the class. Moving this out into globmatch.py will allow these more obvious functions to be maintained without the need to understand how the Select function works/interacts with other parts of duplicity. 

This should also make it easier to replace parts of the glob matching code, such as replacing the regular expression approach with fnmatch, without unexpected effects to the rest of the code base.


-- 
Your team duplicity-team is requested to review the proposed merge of lp:~aaron-whitehouse/duplicity/split_glob_matching_from_select into lp:duplicity.

=== added file 'duplicity/globmatch.py'
--- duplicity/globmatch.py	1970-01-01 00:00:00 +0000
+++ duplicity/globmatch.py	2016-03-05 18:51:18 +0000
@@ -0,0 +1,135 @@
+# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4 -*-
+#
+# Copyright 2002 Ben Escoto <ben@xxxxxxxxxxx>
+# Copyright 2007 Kenneth Loafman <kenneth@xxxxxxxxxxx>
+# Copyright 2014 Aaron Whitehouse <aaron@xxxxxxxxxxxxxxxxxx>
+#
+# This file is part of duplicity.
+#
+# Duplicity is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# Duplicity is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with duplicity; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import re
+
+
+class GlobbingError(Exception):
+    """Something has gone wrong when parsing a glob string"""
+    pass
+
+
+class FilePrefixError(GlobbingError):
+    """Signals that a specified file doesn't start with correct prefix"""
+    pass
+
+
+def _glob_get_prefix_regexs(glob_str):
+    """Return list of regexps equivalent to prefixes of glob_str"""
+    # Internal. Used by glob_get_normal_sf.
+    glob_parts = glob_str.split("/")
+    if "" in glob_parts[1:-1]:
+        # "" OK if comes first or last, as in /foo/
+        raise GlobbingError("Consecutive '/'s found in globbing string "
+                            + glob_str)
+
+    prefixes = ["/".join(glob_parts[:i + 1]) for i in range(len(glob_parts))]
+    # we must make exception for root "/", only dir to end in slash
+    if prefixes[0] == "":
+        prefixes[0] = "/"
+    return list(map(glob_to_regex, prefixes))
+
+
+def path_matches_glob(path, glob_str, include):
+    """Tests whether path matches glob, as per the Unix shell rules, taking as
+    arguments a path, a glob string and include (0 indicating that the glob
+    string is an exclude glob and 1 indicating that it is an include glob,
+    returning:
+    0 - if the file should be excluded
+    1 - if the file should be included
+    2 - if the folder should be scanned for any included/excluded files
+    None - if the selection function has nothing to say about the file
+    """
+    match_only_dirs = False
+
+    # ToDo: Test behaviour of "/" on its own - should always match
+    if glob_str != "/" and glob_str[-1] == "/":
+        match_only_dirs = True
+        # Remove trailing / from directory name (unless that is the entire
+        # string)
+        glob_str = glob_str[:-1]
+
+    re_comp = lambda r: re.compile(r, re.S)
+
+    # matches what glob matches and any files in directory
+    glob_comp_re = re_comp("^%s($|/)" % glob_to_regex(glob_str))
+
+    if glob_str.find("**") != -1:
+        glob_str = glob_str[:glob_str.find("**") + 2]  # truncate after **
+
+    scan_comp_re = re_comp("^(%s)$" %
+                           "|".join(_glob_get_prefix_regexs(glob_str)))
+
+    if match_only_dirs and not path.isdir():
+        # If the glob ended with a /, only match directories
+        return None
+    elif glob_comp_re.match(path.name):
+        return include
+    elif include == 1 and scan_comp_re.match(path.name):
+        return 2
+    else:
+        return None
+
+
+def glob_to_regex(pat):
+    """Returned regular expression equivalent to shell glob pat
+
+    Currently only the ?, *, [], and ** expressions are supported.
+    Ranges like [a-z] are also currently unsupported.  There is no
+    way to quote these special characters.
+
+    This function taken with minor modifications from efnmatch.py
+    by Donovan Baarda.
+
+    """
+    # Internal. Used by glob_get_normal_sf, glob_get_prefix_res and unit tests.
+    i, n, res = 0, len(pat), ''
+    while i < n:
+        c, s = pat[i], pat[i:i + 2]
+        i = i + 1
+        if s == '**':
+            res = res + '.*'
+            i = i + 1
+        elif c == '*':
+            res = res + '[^/]*'
+        elif c == '?':
+            res = res + '[^/]'
+        elif c == '[':
+            j = i
+            if j < n and pat[j] in '!^':
+                j = j + 1
+            if j < n and pat[j] == ']':
+                j = j + 1
+            while j < n and pat[j] != ']':
+                j = j + 1
+            if j >= n:
+                res = res + '\\['  # interpret the [ literally
+            else:
+                # Deal with inside of [..]
+                stuff = pat[i:j].replace('\\', '\\\\')
+                i = j + 1
+                if stuff[0] in '!^':
+                    stuff = '^' + stuff[1:]
+                res = res + '[' + stuff + ']'
+        else:
+            res = res + re.escape(c)
+    return res

=== modified file 'duplicity/selection.py'
--- duplicity/selection.py	2016-01-29 11:43:58 +0000
+++ duplicity/selection.py	2016-03-05 18:51:18 +0000
@@ -23,7 +23,6 @@
 from future_builtins import filter, map
 
 import os  # @UnusedImport
-import re  # @UnusedImport
 import stat  # @UnusedImport
 import sys
 
@@ -32,6 +31,8 @@
 from duplicity import globals  # @Reimport
 from duplicity import diffdir
 from duplicity import util  # @Reimport
+from duplicity.globmatch import GlobbingError, FilePrefixError, \
+    path_matches_glob
 
 """Iterate exactly the requested files in a directory
 
@@ -46,16 +47,6 @@
     pass
 
 
-class FilePrefixError(SelectError):
-    """Signals that a specified file doesn't start with correct prefix"""
-    pass
-
-
-class GlobbingError(SelectError):
-    """Something has gone wrong when parsing a glob string"""
-    pass
-
-
 class Select:
     """Iterate appropriate Paths in given directory
 
@@ -540,57 +531,24 @@
         """
         # Internal. Used by glob_get_sf and unit tests.
 
-        match_only_dirs = False
-
-        if glob_str != "/" and glob_str[-1] == "/":
-            match_only_dirs = True
-            # Remove trailing / from directory name (unless that is the entire
-            # string)
-            glob_str = glob_str[:-1]
+        ignore_case = False
 
         if glob_str.lower().startswith("ignorecase:"):
-            re_comp = lambda r: re.compile(r, re.I | re.S)
-            glob_str = glob_str[len("ignorecase:"):]
-        else:
-            re_comp = lambda r: re.compile(r, re.S)
-
-        # matches what glob matches and any files in directory
-        glob_comp_re = re_comp("^%s($|/)" % self.glob_to_re(glob_str))
-
-        if glob_str.find("**") != -1:
-            glob_str = glob_str[:glob_str.find("**") + 2]  # truncate after **
-
-        scan_comp_re = re_comp("^(%s)$" %
-                               "|".join(self.glob_get_prefix_res(glob_str)))
-
-        def include_sel_func(path):
-            if match_only_dirs and not path.isdir():
-                # If the glob ended with a /, only match directories
-                return None
-            elif glob_comp_re.match(path.name):
-                return 1
-            elif scan_comp_re.match(path.name):
-                return 2
-            else:
-                return None
-
-        def exclude_sel_func(path):
-            if match_only_dirs and not path.isdir():
-                # If the glob ended with a /, only match directories
-                return None
-            elif glob_comp_re.match(path.name):
-                return 0
-            else:
-                return None
+            # Glob string starts with ignorecase, so remove that from the
+            # string and change it to lowercase.
+            glob_str = glob_str[len("ignorecase:"):].lower()
+            ignore_case = True
 
         # Check to make sure prefix is ok
-        if not include_sel_func(self.rootpath):
+        if not path_matches_glob(self.rootpath, glob_str, include=1):
             raise FilePrefixError(glob_str)
 
-        if include:
-            return include_sel_func
-        else:
-            return exclude_sel_func
+        def sel_func(path):
+            if ignore_case:
+                path.name = path.name.lower()
+            return path_matches_glob(path, glob_str, include)
+
+        return sel_func
 
     def exclude_older_get_sf(self, date):
         """Return selection function based on files older than modification date """
@@ -609,62 +567,3 @@
         sel_func.exclude = True
         sel_func.name = "Select older than %s" % (date,)
         return sel_func
-
-    def glob_get_prefix_res(self, glob_str):
-        """Return list of regexps equivalent to prefixes of glob_str"""
-        # Internal. Used by glob_get_normal_sf.
-        glob_parts = glob_str.split("/")
-        if "" in glob_parts[1:-1]:
-            # "" OK if comes first or last, as in /foo/
-            raise GlobbingError("Consecutive '/'s found in globbing string "
-                                + glob_str)
-
-        prefixes = ["/".join(glob_parts[:i + 1]) for i in range(len(glob_parts))]
-        # we must make exception for root "/", only dir to end in slash
-        if prefixes[0] == "":
-            prefixes[0] = "/"
-        return map(self.glob_to_re, prefixes)
-
-    def glob_to_re(self, pat):
-        """Returned regular expression equivalent to shell glob pat
-
-        Currently only the ?, *, [], and ** expressions are supported.
-        Ranges like [a-z] are also currently unsupported.  There is no
-        way to quote these special characters.
-
-        This function taken with minor modifications from efnmatch.py
-        by Donovan Baarda.
-
-        """
-        # Internal. Used by glob_get_normal_sf, glob_get_prefix_res and unit tests.
-        i, n, res = 0, len(pat), ''
-        while i < n:
-            c, s = pat[i], pat[i:i + 2]
-            i = i + 1
-            if s == '**':
-                res = res + '.*'
-                i = i + 1
-            elif c == '*':
-                res = res + '[^/]*'
-            elif c == '?':
-                res = res + '[^/]'
-            elif c == '[':
-                j = i
-                if j < n and pat[j] in '!^':
-                    j = j + 1
-                if j < n and pat[j] == ']':
-                    j = j + 1
-                while j < n and pat[j] != ']':
-                    j = j + 1
-                if j >= n:
-                    res = res + '\\['  # interpret the [ literally
-                else:
-                    # Deal with inside of [..]
-                    stuff = pat[i:j].replace('\\', '\\\\')
-                    i = j + 1
-                    if stuff[0] in '!^':
-                        stuff = '^' + stuff[1:]
-                    res = res + '[' + stuff + ']'
-            else:
-                res = res + re.escape(c)
-        return res

=== added file 'testing/unit/test_globmatch.py'
--- testing/unit/test_globmatch.py	1970-01-01 00:00:00 +0000
+++ testing/unit/test_globmatch.py	2016-03-05 18:51:18 +0000
@@ -0,0 +1,41 @@
+# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4 -*-
+#
+# Copyright 2002 Ben Escoto <ben@xxxxxxxxxxx>
+# Copyright 2007 Kenneth Loafman <kenneth@xxxxxxxxxxx>
+# Copyright 2014 Aaron Whitehouse <aaron@xxxxxxxxxxxxxxxxxx>
+#
+# This file is part of duplicity.
+#
+# Duplicity is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# Duplicity is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with duplicity; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+import unittest
+from duplicity.globmatch import *
+from . import UnitTestCase
+
+
+class MatchingTest(UnitTestCase):
+    """Test matching of file names against various selection functions"""
+
+    def test_glob_re(self):
+        """test_glob_re - test translation of shell pattern to regular exp"""
+        assert glob_to_regex("hello") == "hello"
+        assert glob_to_regex(".e?ll**o") == "\\.e[^/]ll.*o"
+        r = glob_to_regex("[abc]el[^de][!fg]h")
+        assert r == "[abc]el[^de][^fg]h", r
+        r = glob_to_regex("/usr/*/bin/")
+        assert r == "\\/usr\\/[^/]*\\/bin\\/", r
+        assert glob_to_regex("[a.b/c]") == "[a.b/c]"
+        r = glob_to_regex("[a*b-c]e[!]]")
+        assert r == "[a*b-c]e[^]]", r
\ No newline at end of file

=== modified file 'testing/unit/test_selection.py'
--- testing/unit/test_selection.py	2016-01-24 21:44:51 +0000
+++ testing/unit/test_selection.py	2016-03-05 18:51:18 +0000
@@ -116,18 +116,6 @@
         assert sf2(self.makeext("what/ever.py")) == 0
         assert sf2(self.makeext("what/ever.py/foo")) == 0
 
-    def test_glob_re(self):
-        """test_glob_re - test translation of shell pattern to regular exp"""
-        assert self.Select.glob_to_re("hello") == "hello"
-        assert self.Select.glob_to_re(".e?ll**o") == "\\.e[^/]ll.*o"
-        r = self.Select.glob_to_re("[abc]el[^de][!fg]h")
-        assert r == "[abc]el[^de][^fg]h", r
-        r = self.Select.glob_to_re("/usr/*/bin/")
-        assert r == "\\/usr\\/[^/]*\\/bin\\/", r
-        assert self.Select.glob_to_re("[a.b/c]") == "[a.b/c]"
-        r = self.Select.glob_to_re("[a*b-c]e[!]]")
-        assert r == "[a*b-c]e[^]]", r
-
     def test_simple_glob_double_asterisk(self):
         """test_simple_glob_double_asterisk - primarily to check that the defaults used by the error tests work"""
         assert self.Select.glob_get_normal_sf("**", 1)

Follow ups

[Merge] lp:~aaron-whitehouse/duplicity/split_glob_matching_from_select into lp:duplicity
From: noreply, 2016-03-07