|
@@ -33,9 +33,82 @@ def check_indent(codestr):
|
|
|
return codestr
|
|
|
|
|
|
|
|
|
+# Basically pickle, in python 2.7.3 at least, handles duplicated data badly
|
|
|
+# upon pickling and unpickling. Combine this with duplicate objects and things
|
|
|
+# are a mess.
|
|
|
+#
|
|
|
+# When the sets are originally created, python calls intern() on the set keys
|
|
|
+# which significantly improves memory usage. Sadly the pickle/unpickle process
|
|
|
+# doesn't call intern() on the keys and results in the same strings being duplicated
|
|
|
+# in memory. This also means pickle will save the same string multiple times in
|
|
|
+# the cache file.
|
|
|
+#
|
|
|
+# By having shell and python cacheline objects with setstate/getstate, we force
|
|
|
+# the object creation through our own routine where we can call intern (via internSet).
|
|
|
+#
|
|
|
+# We also use hashable frozensets and ensure we use references to these so that
|
|
|
+# duplicates can be removed, both in memory and in the resulting pickled data.
|
|
|
+#
|
|
|
+# By playing these games, the size of the cache file shrinks dramatically
|
|
|
+# meaning faster load times and the reloaded cache files also consume much less
|
|
|
+# memory. Smaller cache files, faster load times and lower memory usage is good.
|
|
|
+#
|
|
|
+# A custom getstate/setstate using tuples saves about 15% of the cache size by
|
|
|
+# avoiding duplication of the attribute names!
|
|
|
+
|
|
|
+class SetCache(object):
|
|
|
+ def __init__(self):
|
|
|
+ self.setcache = {}
|
|
|
+
|
|
|
+ def internSet(self, items):
|
|
|
+
|
|
|
+ new = []
|
|
|
+ for i in items:
|
|
|
+ new.append(intern(i))
|
|
|
+ s = frozenset(new)
|
|
|
+ if hash(s) in self.setcache:
|
|
|
+ return self.setcache[hash(s)]
|
|
|
+ self.setcache[hash(s)] = s
|
|
|
+ return s
|
|
|
+
|
|
|
+codecache = SetCache()
|
|
|
+
|
|
|
+class pythonCacheLine(object):
|
|
|
+ def __init__(self, refs, execs, contains):
|
|
|
+ self.refs = codecache.internSet(refs)
|
|
|
+ self.execs = codecache.internSet(execs)
|
|
|
+ self.contains = {}
|
|
|
+ for c in contains:
|
|
|
+ self.contains[c] = codecache.internSet(contains[c])
|
|
|
+
|
|
|
+ def __getstate__(self):
|
|
|
+ return (self.refs, self.execs, self.contains)
|
|
|
+
|
|
|
+ def __setstate__(self, state):
|
|
|
+ (refs, execs, contains) = state
|
|
|
+ self.__init__(refs, execs, contains)
|
|
|
+ def __hash__(self):
|
|
|
+ l = (hash(self.refs), hash(self.execs))
|
|
|
+ for c in sorted(self.contains.keys()):
|
|
|
+ l = l + (c, hash(self.contains[c]))
|
|
|
+ return hash(l)
|
|
|
+
|
|
|
+class shellCacheLine(object):
|
|
|
+ def __init__(self, execs):
|
|
|
+ self.execs = codecache.internSet(execs)
|
|
|
+
|
|
|
+ def __getstate__(self):
|
|
|
+ return (self.execs)
|
|
|
+
|
|
|
+ def __setstate__(self, state):
|
|
|
+ (execs) = state
|
|
|
+ self.__init__(execs)
|
|
|
+ def __hash__(self):
|
|
|
+ return hash(self.execs)
|
|
|
+
|
|
|
class CodeParserCache(MultiProcessCache):
|
|
|
cache_file_name = "bb_codeparser.dat"
|
|
|
- CACHE_VERSION = 6
|
|
|
+ CACHE_VERSION = 7
|
|
|
|
|
|
def __init__(self):
|
|
|
MultiProcessCache.__init__(self)
|
|
@@ -44,6 +117,27 @@ class CodeParserCache(MultiProcessCache):
|
|
|
self.pythoncacheextras = self.cachedata_extras[0]
|
|
|
self.shellcacheextras = self.cachedata_extras[1]
|
|
|
|
|
|
+ # To avoid duplication in the codeparser cache, keep
|
|
|
+ # a lookup of hashes of objects we already have
|
|
|
+ self.pythoncachelines = {}
|
|
|
+ self.shellcachelines = {}
|
|
|
+
|
|
|
+ def newPythonCacheLine(self, refs, execs, contains):
|
|
|
+ cacheline = pythonCacheLine(refs, execs, contains)
|
|
|
+ h = hash(cacheline)
|
|
|
+ if h in self.pythoncachelines:
|
|
|
+ return self.pythoncachelines[h]
|
|
|
+ self.pythoncachelines[h] = cacheline
|
|
|
+ return cacheline
|
|
|
+
|
|
|
+ def newShellCacheLine(self, execs):
|
|
|
+ cacheline = shellCacheLine(execs)
|
|
|
+ h = hash(cacheline)
|
|
|
+ if h in self.shellcachelines:
|
|
|
+ return self.shellcachelines[h]
|
|
|
+ self.shellcachelines[h] = cacheline
|
|
|
+ return cacheline
|
|
|
+
|
|
|
def init_cache(self, d):
|
|
|
MultiProcessCache.init_cache(self, d)
|
|
|
|
|
@@ -51,25 +145,6 @@ class CodeParserCache(MultiProcessCache):
|
|
|
self.pythoncache = self.cachedata[0]
|
|
|
self.shellcache = self.cachedata[1]
|
|
|
|
|
|
- def compress_keys(self, data):
|
|
|
- # When the dicts are originally created, python calls intern() on the set keys
|
|
|
- # which significantly improves memory usage. Sadly the pickle/unpickle process
|
|
|
- # doesn't call intern() on the keys and results in the same strings being duplicated
|
|
|
- # in memory. This also means pickle will save the same string multiple times in
|
|
|
- # the cache file. By interning the data here, the cache file shrinks dramatically
|
|
|
- # meaning faster load times and the reloaded cache files also consume much less
|
|
|
- # memory. This is worth any performance hit from this loops and the use of the
|
|
|
- # intern() data storage.
|
|
|
- # Python 3.x may behave better in this area
|
|
|
- for h in data[0]:
|
|
|
- data[0][h]["refs"] = self.internSet(data[0][h]["refs"])
|
|
|
- data[0][h]["execs"] = self.internSet(data[0][h]["execs"])
|
|
|
- for k in data[0][h]["contains"]:
|
|
|
- data[0][h]["contains"][k] = self.internSet(data[0][h]["contains"][k])
|
|
|
- for h in data[1]:
|
|
|
- data[1][h]["execs"] = self.internSet(data[1][h]["execs"])
|
|
|
- return
|
|
|
-
|
|
|
def create_cachedata(self):
|
|
|
data = [{}, {}]
|
|
|
return data
|
|
@@ -168,15 +243,19 @@ class PythonParser():
|
|
|
h = hash(str(node))
|
|
|
|
|
|
if h in codeparsercache.pythoncache:
|
|
|
- self.references = codeparsercache.pythoncache[h]["refs"]
|
|
|
- self.execs = codeparsercache.pythoncache[h]["execs"]
|
|
|
- self.contains = codeparsercache.pythoncache[h]["contains"]
|
|
|
+ self.references = set(codeparsercache.pythoncache[h].refs)
|
|
|
+ self.execs = set(codeparsercache.pythoncache[h].execs)
|
|
|
+ self.contains = {}
|
|
|
+ for i in codeparsercache.pythoncache[h].contains:
|
|
|
+ self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
|
|
|
return
|
|
|
|
|
|
if h in codeparsercache.pythoncacheextras:
|
|
|
- self.references = codeparsercache.pythoncacheextras[h]["refs"]
|
|
|
- self.execs = codeparsercache.pythoncacheextras[h]["execs"]
|
|
|
- self.contains = codeparsercache.pythoncacheextras[h]["contains"]
|
|
|
+ self.references = set(codeparsercache.pythoncacheextras[h].refs)
|
|
|
+ self.execs = set(codeparsercache.pythoncacheextras[h].execs)
|
|
|
+ self.contains = {}
|
|
|
+ for i in codeparsercache.pythoncacheextras[h].contains:
|
|
|
+ self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
|
|
|
return
|
|
|
|
|
|
code = compile(check_indent(str(node)), "<string>", "exec",
|
|
@@ -188,10 +267,7 @@ class PythonParser():
|
|
|
|
|
|
self.execs.update(self.var_execs)
|
|
|
|
|
|
- codeparsercache.pythoncacheextras[h] = {}
|
|
|
- codeparsercache.pythoncacheextras[h]["refs"] = self.references
|
|
|
- codeparsercache.pythoncacheextras[h]["execs"] = self.execs
|
|
|
- codeparsercache.pythoncacheextras[h]["contains"] = self.contains
|
|
|
+ codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
|
|
|
|
|
|
class ShellParser():
|
|
|
def __init__(self, name, log):
|
|
@@ -210,18 +286,17 @@ class ShellParser():
|
|
|
h = hash(str(value))
|
|
|
|
|
|
if h in codeparsercache.shellcache:
|
|
|
- self.execs = codeparsercache.shellcache[h]["execs"]
|
|
|
+ self.execs = set(codeparsercache.shellcache[h].execs)
|
|
|
return self.execs
|
|
|
|
|
|
if h in codeparsercache.shellcacheextras:
|
|
|
- self.execs = codeparsercache.shellcacheextras[h]["execs"]
|
|
|
+ self.execs = set(codeparsercache.shellcacheextras[h].execs)
|
|
|
return self.execs
|
|
|
|
|
|
self._parse_shell(value)
|
|
|
self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)
|
|
|
|
|
|
- codeparsercache.shellcacheextras[h] = {}
|
|
|
- codeparsercache.shellcacheextras[h]["execs"] = self.execs
|
|
|
+ codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)
|
|
|
|
|
|
return self.execs
|
|
|
|