Skip to content

Commit c1977d3

Browse files
committed
optimize walking over a commit’s tree
- replace shallow-tree-walk with deep-tree-walk - invert control of tree walker: rather than the recursive helper function in commit-tx-data, pass a walker function to deep-tree-walk - a tree walker signals if the path is new, and if not, the deep-tree-walk will skip over uninteresting subtrees - add index to :node/object and make check for existing node much more efficient - fix logic for testing if path is new - make sure future is deref’ed when transacting a commit’s tx data
1 parent 840bddd commit c1977d3

File tree

2 files changed

+196
-100
lines changed

2 files changed

+196
-100
lines changed

src/datomic/codeq/core.clj

Lines changed: 115 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,12 @@
1111
[clojure.java.io :as io]
1212
[clojure.set]
1313
[clojure.string :as string]
14-
[datomic.codeq.util :refer [index->id-fn tempid?]]
1514
[datomic.codeq.analyzer :as az]
1615
[datomic.codeq.git :as git]
1716
[datomic.codeq.analyzers.clj])
17+
(:use datomic.codeq.util)
1818
(:import java.util.Date
19-
org.eclipse.jgit.lib.Repository
20-
org.eclipse.jgit.revwalk.RevCommit)
19+
org.eclipse.jgit.lib.Repository)
2120
(:gen-class))
2221

2322
(set! *warn-on-reflection* true)
@@ -172,6 +171,7 @@
172171
:db/valueType :db.type/ref
173172
:db/cardinality :db.cardinality/one
174173
:db/doc "Git object (tree/blob) in a tree node"
174+
:db/index true
175175
:db.install/_attribute :db.part/db}
176176

177177
{:db/id #db/id[:db.part/db]
@@ -259,77 +259,121 @@
259259
@(d/transact conn schema)))
260260

261261

262+
(defn authors-tx-data
263+
[db {:keys [author committer]}]
264+
(let [tempid? map?
265+
email->id (index->id-fn db :email/address)
266+
authorid (email->id author)
267+
committerid (email->id committer)
268+
tx-data (cond-> []
269+
;;record author's email if new
270+
(tempid? authorid)
271+
(conj [:db/add authorid :email/address author])
272+
;;record committer's email if new and is
273+
;;distinct from the author's email
274+
(and (not= committer author) (tempid? committerid))
275+
(conj [:db/add committerid :email/address committer]))]
276+
[authorid committerid tx-data]))
277+
278+
279+
(defn commit-node-tx-data
280+
[db repoid root-nodeid {:keys [sha msg parents authored committed] :as commit}]
281+
(let [commitid (d/tempid :db.part/user)
282+
[authorid committerid author-tx-data] (authors-tx-data db commit)]
283+
(into author-tx-data
284+
[[:db/add repoid :repo/commits commitid]
285+
{:db/id (d/tempid :db.part/tx)
286+
:tx/commit commitid
287+
:tx/op :import}
288+
(cond-> {:db/id commitid
289+
:git/type :commit
290+
:commit/tree root-nodeid
291+
:git/sha sha
292+
:commit/author authorid
293+
:commit/authoredAt authored
294+
:commit/committer committerid
295+
:commit/committedAt committed}
296+
msg
297+
(assoc :commit/message msg)
298+
parents
299+
(assoc :commit/parents
300+
(mapv (fn [p]
301+
(if-let [id (index-get-id db :git/sha p)]
302+
id
303+
(throw (ex-info "Parent not previously imported"
304+
{:sha sha :parent p}))))
305+
parents)))])))
306+
307+
262308
(defn commit-tx-data
263-
[db repo repoid repo-name {:keys [sha msg tree parents author authored committer committed] :as commit}]
309+
[db repo repoid repo-name {:keys [sha msg tree parents authored committed] :as commit}]
264310
(let [tempid? map? ;;todo - better pred
265311
sha->id (index->id-fn db :git/sha)
266-
email->id (index->id-fn db :email/address)
267312
filename->id (index->id-fn db :file/name)
268-
authorid (email->id author)
269-
committerid (email->id committer)
270-
cid (d/tempid :db.part/user)
271-
tx-data (fn f [inpath [sha type filename]]
272-
(let [path (str inpath filename)
273-
id (sha->id sha)
274-
filenameid (filename->id filename)
275-
pathid (filename->id path)
276-
nodeid (or (and (not (tempid? id))
277-
(not (tempid? filenameid))
278-
(ffirst (d/q '[:find ?e :in $ ?filename ?id
279-
:where [?e :node/filename ?filename] [?e :node/object ?id]]
280-
db filenameid id)))
281-
(d/tempid :db.part/user))
282-
newpath (or (tempid? pathid) (tempid? nodeid)
283-
(not (ffirst (d/q '[:find ?node :in $ ?path
284-
:where [?node :node/paths ?path]]
285-
db pathid))))
286-
data (cond-> []
287-
(tempid? filenameid) (conj [:db/add filenameid :file/name filename])
288-
(tempid? pathid) (conj [:db/add pathid :file/name path])
289-
(tempid? nodeid) (conj {:db/id nodeid :node/filename filenameid :node/object id})
290-
newpath (conj [:db/add nodeid :node/paths pathid])
291-
(tempid? id) (conj {:db/id id :git/sha sha :git/type type}))
292-
data (if (and newpath (= type :tree))
293-
(let [rev-tree (git/lookup-rev-tree repo sha)
294-
es (git/shallow-tree-walk repo rev-tree)]
295-
(reduce (fn [data child]
296-
(let [[cid cdata] (f (str path "/") child)
297-
data (into data cdata)]
298-
(cond-> data
299-
(tempid? id) (conj [:db/add id :tree/nodes cid]))))
300-
data es))
301-
data)]
302-
[nodeid data]))
303-
[treeid treedata] (tx-data nil [tree :tree repo-name])
304-
tx (into treedata
305-
[[:db/add repoid :repo/commits cid]
306-
{:db/id (d/tempid :db.part/tx)
307-
:tx/commit cid
308-
:tx/op :import}
309-
(cond-> {:db/id cid
310-
:git/type :commit
311-
:commit/tree treeid
312-
:git/sha sha
313-
:commit/author authorid
314-
:commit/authoredAt authored
315-
:commit/committer committerid
316-
:commit/committedAt committed
317-
}
318-
msg (assoc :commit/message msg)
319-
parents (assoc :commit/parents
320-
(mapv (fn [p]
321-
(let [id (sha->id p)]
322-
(assert (not (tempid? id))
323-
(str "Parent " p " not previously imported"))
324-
id))
325-
parents)))])
326-
tx (cond-> tx
327-
(tempid? authorid)
328-
(conj [:db/add authorid :email/address author])
329-
330-
(and (not= committer author) (tempid? committerid))
331-
(conj [:db/add committerid :email/address committer]))]
332-
tx))
313+
commitid (d/tempid :db.part/user)
314+
;;find a node by object and filename id
315+
check-for-node
316+
(fn [object-id filename-id]
317+
(some #(let [{eid :e} %]
318+
(if (seq (d/datoms db :eavt eid :node/filename filename-id))
319+
eid))
320+
(d/datoms db :avet :node/object object-id)))
321+
322+
walker-fn
323+
(fn [{:keys [sha type path filename parent]}]
324+
(let [;;lookup id for file/tree object by tree sha
325+
objid (sha->id sha)
326+
;;lookup id for file name
327+
filenameid (filename->id filename)
328+
;;lookup id for complete path
329+
pathid (filename->id path)
330+
;;lookup id for tree node
331+
;;new if either tree sha or filename are new
332+
;;i.e. file with new content
333+
nodeid (or (and (not (tempid? objid))
334+
(not (tempid? filenameid))
335+
(check-for-node objid filenameid))
336+
(d/tempid :db.part/user))
337+
;;path is new if: 1. path name is new, or 2. path name
338+
;;exists but tree node is new (file name and sha are
339+
;;unique), or 3. path name and tree node both exist
340+
;;but the former is not already a path of the latter.
341+
newpath (or (tempid? pathid) (tempid? nodeid)
342+
(every? #(not= nodeid (:e %))
343+
(d/datoms db :vaet pathid :node/paths)))
344+
data (cond->
345+
[]
346+
;;record file name if new
347+
(tempid? filenameid)
348+
(conj [:db/add filenameid :file/name filename])
349+
;;record path name if new
350+
(tempid? pathid)
351+
(conj [:db/add pathid :file/name path])
352+
;;record tree node if new
353+
(tempid? nodeid)
354+
(conj {:db/id nodeid
355+
:node/filename filenameid
356+
:node/object objid})
357+
;;link tree node to new paths
358+
newpath
359+
(conj [:db/add nodeid :node/paths pathid])
360+
;;link new tree node to parent
361+
(and (tempid? nodeid) parent)
362+
(conj [:db/add parent :tree/nodes nodeid])
363+
;;record the sha of file/tree objects
364+
(tempid? objid)
365+
(conj {:db/id objid :git/sha sha :git/type type}))]
366+
;;emit the tree node, its object,
367+
;;and the accumulated tx data
368+
;;indicating if the tree node is new
369+
{:node-id nodeid
370+
:object-id objid
371+
:new-path newpath
372+
:data data}))
373+
374+
[root-nodeid treedata] (git/deep-tree-walk repo repo-name tree walker-fn)]
375+
(into treedata
376+
(commit-node-tx-data db repoid root-nodeid commit))))
333377

334378

335379
(defn unimported-commits
@@ -367,7 +411,7 @@
367411
(doseq [commit commits]
368412
(let [db (d/db conn)]
369413
(println "Importing commit:" (:sha commit))
370-
(d/transact conn (commit-tx-data db repo repoid repo-name commit))))
414+
@(d/transact conn (commit-tx-data db repo repoid repo-name commit))))
371415
(d/request-index conn)
372416
(println "Import complete!")))
373417

src/datomic/codeq/git.clj

Lines changed: 81 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
org.eclipse.jgit.storage.file.FileRepositoryBuilder))
88

99

10+
(set! *warn-on-reflection* true)
11+
12+
1013
(defn ^Repository open-existing-repo
1114
"Open an exisiting git repository by
1215
scanning GIT_* environment variables
@@ -59,7 +62,11 @@
5962
(proxy [RevFilter] []
6063
(clone [] this)
6164
(include [rev-walk rev-commit]
62-
(nil? (imported-commits (.getName ^RevCommit rev-commit))))
65+
(let [sha (.getName ^RevCommit rev-commit)
66+
incl (nil? (imported-commits sha))]
67+
(when-not incl
68+
(println "Skipping commit: " sha))
69+
incl))
6370
(requiresCommitBody [] false))]
6471
(seq (doto rev-walk
6572
(.markStart rev-commit)
@@ -85,32 +92,77 @@
8592
:committed (.getWhen committer)}))
8693

8794

88-
(defn lookup-rev-tree
89-
"Return the RevTree object that resolves from a tree SHA id"
95+
(defn deep-tree-walk
96+
"Walk over the entire tree of repository repo with name repo-name
97+
starting from the point identitfied by tree-sha, using the function
98+
tree-walker to produce transaction data.
99+
100+
Returns the root tree node id along with the accumulation of
101+
transaction data produced by calling tree-walker on each node of
102+
the tree walk.
103+
104+
The tree walker function is given a map
105+
{:sha :type :path :filename :parent}
106+
Where the :type is :tree or :blob. :parent is nil at the root of
107+
the tree walk. The tree walker function must return a map
108+
{:node-id :object-id :new-path :data}
109+
Which contains the transaction data along with the tree node id
110+
and object id for linking the node and object. Also the boolean
111+
:new-path indicates if the path that the walker has processed is
112+
new. If so, deep-tree-walk will step into subtrees."
90113
[^Repository repo
91-
^String sha]
92-
(->> sha
93-
(.resolve repo)
94-
(.parseTree (RevWalk. repo))))
95-
96-
97-
(defn shallow-tree-walk
98-
"Walks one level of a RevTree object,
99-
returning the trees and blobs contained therein.
100-
101-
Returns [[sha (:tree OR :blob) file-name] ...]"
102-
[^Repository repo
103-
^RevTree tree]
104-
(let [tree-walk (doto (TreeWalk. repo)
105-
(.addTree tree)
106-
(.setRecursive false))
107-
dir (transient [])]
108-
(while (.next tree-walk)
109-
(conj! dir
110-
[(.. tree-walk (getObjectId 0) (getName))
111-
(if (= (.getFileMode tree-walk 0)
112-
FileMode/TREE)
113-
:tree :blob)
114-
(.getNameString tree-walk)]))
115-
(persistent! dir)))
116-
114+
repo-name
115+
^String tree-sha
116+
tree-walker]
117+
(let [;;resolve tree-sha to a revision tree
118+
rev-tree (->> tree-sha
119+
(.resolve repo)
120+
(.parseTree (RevWalk. repo)))
121+
;;set revision tree as starting point for tree walk
122+
tree-walk (doto (TreeWalk. repo) (.addTree rev-tree))
123+
;;create a root tree node from the repository name
124+
{root-nodeid :node-id root-treeid :object-id new-root :new-path seed-data :data}
125+
(tree-walker {:sha tree-sha :type :tree
126+
:path repo-name :filename repo-name
127+
:parent nil})]
128+
(if-not new-root
129+
;;if root node is not new, then there is nothing to walk
130+
[root-nodeid seed-data]
131+
(loop [stack (list root-treeid)
132+
depth (.getDepth tree-walk)
133+
tx-data seed-data]
134+
(if-not (.next tree-walk)
135+
[root-nodeid tx-data]
136+
(let [curr-id (.getObjectId tree-walk 0)
137+
sha (.getName curr-id)
138+
path (str repo-name "/" (.getPathString tree-walk))
139+
filename (.getNameString tree-walk)]
140+
(cond
141+
;;tree walk is pointing at a tree to step into
142+
(.isSubtree tree-walk)
143+
(let [{:keys [object-id data new-path]}
144+
(tree-walker {:sha sha :type :tree
145+
:path path :filename filename
146+
:parent (peek stack)})]
147+
(if new-path
148+
;;enter subtree only if it's a new path
149+
(do (.enterSubtree tree-walk)
150+
(recur (conj stack object-id) (.getDepth tree-walk)
151+
(into tx-data data)))
152+
;;else skip over it
153+
(recur stack depth (into tx-data data))))
154+
;;depth has decrease so we must have popped out of a subtree
155+
(< (.getDepth tree-walk) depth)
156+
(let [new-depth (.getDepth tree-walk)
157+
new-stack (seq (drop (- depth new-depth) stack))
158+
{:keys [data]} (tree-walker {:sha sha :type :blob
159+
:path path :filename filename
160+
:parent (peek new-stack)})]
161+
(recur new-stack new-depth (into tx-data data)))
162+
;;else continue at same depth with another blob
163+
:else
164+
(let [{:keys [data]}
165+
(tree-walker {:sha sha :type :blob
166+
:path path :filename filename
167+
:parent (peek stack)})]
168+
(recur stack depth (into tx-data data))))))))))

0 commit comments

Comments
 (0)