Skip to content

Commit 3019feb

Browse files
authored
CODEC-335: Add DigestUtils.gitBlob and DigestUtils.gitTree methods (#427)
This change adds two methods to `DigestUtils` that compute generalized Git object identifiers using an arbitrary `MessageDigest`, rather than being restricted to SHA-1: - `gitBlob(digest, input)`: computes a generalized [Git blob object identifier](https://git-scm.com/book/en/v2/Git-Internals-Git-Objects) for a given file or byte content. - `gitTree(digest, file)`: computes a generalized [Git tree object identifier](https://git-scm.com/book/en/v2/Git-Internals-Git-Objects) for a given directory. ### Motivation The standard Git object identifiers use SHA-1, which is [in the process of being replaced by SHA-256](https://git-scm.com/docs/hash-function-transition) in Git itself. These methods generalize the identifier computation to support any `MessageDigest`, enabling both forward compatibility and use with external standards. In particular, the `swh:1:cnt:` (content) and `swh:1:dir:` (directory) identifier types defined by [SWHID (ISO/IEC 18670)](https://www.swhid.org/specification/v1.2/5.Core_identifiers/) are currently compatible with Git blob and tree identifiers respectively (using SHA-1), and can be used to generate canonical, persistent identifiers for unpacked source and binary distributions.
1 parent 0646e8c commit 3019feb

8 files changed

Lines changed: 491 additions & 0 deletions

File tree

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ The <action> type attribute can be add,update,fix,remove.
5252
<!-- ADD -->
5353
<action type="add" dev="ggregory" due-to="Inkeet, Gary Gregory, Wolff Bock von Wuelfingen" issue="CODEC-326">Add Base58 support.</action>
5454
<action type="add" dev="ggregory" due-to="Gary Gregory">Add BaseNCodecInputStream.AbstracBuilder.setByteArray(byte[]).</action>
55+
<action type="add" issue="CODEC-335" dev="pkarwasz" due-to="Piotr P. Karwasz">Add DigestUtils.gitBlob() and DigestUtils.gitTree() to compute Git blob and tree object identifiers.</action>
5556
<!-- UPDATE -->
5657
<action type="update" dev="ggregory" due-to="Gary Gregory">Bump org.apache.commons:commons-parent from 96 to 97.</action>
5758
</release>

src/main/java/org/apache/commons/codec/digest/DigestUtils.java

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,24 @@
1818
package org.apache.commons.codec.digest;
1919

2020
import java.io.BufferedInputStream;
21+
import java.io.ByteArrayOutputStream;
2122
import java.io.File;
2223
import java.io.IOException;
2324
import java.io.InputStream;
2425
import java.io.RandomAccessFile;
2526
import java.nio.ByteBuffer;
2627
import java.nio.channels.FileChannel;
28+
import java.nio.charset.StandardCharsets;
29+
import java.nio.file.DirectoryStream;
2730
import java.nio.file.Files;
2831
import java.nio.file.OpenOption;
2932
import java.nio.file.Path;
3033
import java.security.MessageDigest;
3134
import java.security.NoSuchAlgorithmException;
35+
import java.util.ArrayList;
36+
import java.util.Collection;
37+
import java.util.List;
38+
import java.util.TreeSet;
3239

3340
import org.apache.commons.codec.binary.Hex;
3441
import org.apache.commons.codec.binary.StringUtils;
@@ -139,6 +146,134 @@ public static byte[] digest(final MessageDigest messageDigest, final RandomAcces
139146
return updateDigest(messageDigest, data).digest();
140147
}
141148

149+
/**
150+
* Reads through a byte array and return a generalized Git blob identifier
151+
*
152+
* <p>The identifier is computed in the way described by the
153+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
154+
* algorithm.</p>
155+
*
156+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
157+
*
158+
* @param messageDigest The MessageDigest to use (for example SHA-1).
159+
* @param data Data to digest.
160+
* @return A generalized Git blob identifier.
161+
* @since 1.22.0
162+
*/
163+
public static byte[] gitBlob(final MessageDigest messageDigest, final byte[] data) {
164+
messageDigest.reset();
165+
updateDigest(messageDigest, gitBlobPrefix(data.length));
166+
return digest(messageDigest, data);
167+
}
168+
169+
/**
170+
* Reads through a byte array and return a generalized Git blob identifier
171+
*
172+
* <p>The identifier is computed in the way described by the
173+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#52-contents">SWHID contents identifier</a>, but it can use any hash
174+
* algorithm.</p>
175+
*
176+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git blob identifier and SWHID contents identifier.</p>
177+
*
178+
* @param messageDigest The MessageDigest to use (for example SHA-1).
179+
* @param data Data to digest.
180+
* @param options Options how to open the file
181+
* @return A generalized Git blob identifier.
182+
* @throws IOException On error accessing the file
183+
* @since 1.22.0
184+
*/
185+
public static byte[] gitBlob(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException {
186+
messageDigest.reset();
187+
updateDigest(messageDigest, gitBlobPrefix(Files.size(data)));
188+
return updateDigest(messageDigest, data, options).digest();
189+
}
190+
191+
private static byte[] gitBlobPrefix(final long dataSize) {
192+
return ("blob " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
193+
}
194+
195+
/**
196+
* Returns a generalized Git tree identifier
197+
*
198+
* <p>The identifier is computed in the way described by the
199+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
200+
* algorithm.</p>
201+
*
202+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
203+
*
204+
* @param messageDigest The MessageDigest to use (for example SHA-1)
205+
* @param entries The directory entries
206+
* @return A generalized Git tree identifier.
207+
*/
208+
static byte[] gitTree(final MessageDigest messageDigest, final Collection<GitDirectoryEntry> entries) {
209+
final TreeSet<GitDirectoryEntry> treeSet = new TreeSet<>(entries);
210+
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
211+
for (final GitDirectoryEntry entry : treeSet) {
212+
final byte[] treeEntryBytes = entry.toTreeEntryBytes();
213+
baos.write(treeEntryBytes, 0, treeEntryBytes.length);
214+
}
215+
messageDigest.reset();
216+
updateDigest(messageDigest, gitTreePrefix(baos.size()));
217+
return updateDigest(messageDigest, baos.toByteArray()).digest();
218+
}
219+
220+
/**
221+
* Reads through a byte array and return a generalized Git tree identifier
222+
*
223+
* <p>The identifier is computed in the way described by the
224+
* <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID directory identifier</a>, but it can use any hash
225+
* algorithm.</p>
226+
*
227+
* <p>When the hash algorithm is SHA-1, the identifier is identical to Git tree identifier and SWHID directory identifier.</p>
228+
*
229+
* @param messageDigest The MessageDigest to use (for example SHA-1).
230+
* @param data Data to digest.
231+
* @param options Options how to open the file
232+
* @return A generalized Git tree identifier.
233+
* @throws IOException On error accessing the file
234+
* @since 1.22.0
235+
*/
236+
public static byte[] gitTree(final MessageDigest messageDigest, final Path data, final OpenOption... options) throws IOException {
237+
final List<GitDirectoryEntry> entries = new ArrayList<>();
238+
try (DirectoryStream<Path> files = Files.newDirectoryStream(data)) {
239+
for (final Path path : files) {
240+
final GitDirectoryEntry.Type type = getGitDirectoryEntryType(path);
241+
final byte[] rawObjectId;
242+
if (type == GitDirectoryEntry.Type.DIRECTORY) {
243+
rawObjectId = gitTree(messageDigest, path, options);
244+
} else {
245+
rawObjectId = gitBlob(messageDigest, path, options);
246+
}
247+
entries.add(new GitDirectoryEntry(path, type, rawObjectId));
248+
}
249+
}
250+
return gitTree(messageDigest, entries);
251+
}
252+
253+
/**
254+
* Returns the {@link GitDirectoryEntry.Type} of a file.
255+
*
256+
* @param path The file to check.
257+
* @return A {@link GitDirectoryEntry.Type}
258+
*/
259+
private static GitDirectoryEntry.Type getGitDirectoryEntryType(final Path path) {
260+
// Symbolic links first
261+
if (Files.isSymbolicLink(path)) {
262+
return GitDirectoryEntry.Type.SYMBOLIC_LINK;
263+
}
264+
if (Files.isDirectory(path)) {
265+
return GitDirectoryEntry.Type.DIRECTORY;
266+
}
267+
if (Files.isExecutable(path)) {
268+
return GitDirectoryEntry.Type.EXECUTABLE;
269+
}
270+
return GitDirectoryEntry.Type.REGULAR;
271+
}
272+
273+
private static byte[] gitTreePrefix(final long dataSize) {
274+
return ("tree " + dataSize + "\0").getBytes(StandardCharsets.UTF_8);
275+
}
276+
142277
/**
143278
* Gets a {@code MessageDigest} for the given {@code algorithm}.
144279
*
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* https://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.commons.codec.digest;
19+
20+
import java.nio.charset.StandardCharsets;
21+
import java.nio.file.Path;
22+
import java.util.Objects;
23+
24+
/**
 * Represents a single entry in a Git tree object.
 *
 * <p>A Git tree object encodes a directory snapshot. Each entry holds:</p>
 * <ul>
 * <li>a {@link Type} that determines the Unix file mode (e.g. {@code 100644} for a regular file),</li>
 * <li>the entry name (file or directory name, without a path separator),</li>
 * <li>the raw object id of the referenced blob or sub-tree.</li>
 * </ul>
 *
 * <p>Entries are ordered by {@link #compareTo} using Git's tree-sort rule: names are compared byte by byte, as unsigned values of their UTF-8 encoding, and
 * directory names are compared as if they ended with {@code '/'}, so that {@code foo/} sorts after {@code foo-bar} but before {@code foobar}.</p>
 *
 * <p>{@link #equals(Object)} and {@link #hashCode()} consider the entry name only, so a file and a directory with the same name are equal although
 * {@link #compareTo} orders them differently.</p>
 *
 * <p>Call {@link #toTreeEntryBytes()} to obtain the binary encoding that Git feeds to its hash function when computing the tree object identifier.</p>
 *
 * @see <a href="https://git-scm.com/book/en/v2/Git-Internals-Git-Objects">Git Internals – Git Objects</a>
 * @see <a href="https://www.swhid.org/swhid-specification/v1.2/5.Core_identifiers/#53-directories">SWHID Directory Identifier</a>
 */
class GitDirectoryEntry implements Comparable<GitDirectoryEntry> {

    /**
     * The entry name (file or directory name, no path separator).
     */
    private final String name;

    /**
     * The key used for ordering entries within a tree object: the UTF-8 bytes of the name.
     *
     * <p>Git appends {@code '/'} to directory names before comparing.</p>
     */
    private final byte[] sortKey;

    /**
     * The Git object type, which determines the Unix file-mode prefix.
     */
    private final Type type;

    /**
     * The raw object id of the referenced blob or sub-tree.
     */
    private final byte[] rawObjectId;

    /**
     * Compares two byte arrays lexicographically, treating each byte as an unsigned value.
     *
     * <p>{@code String.compareTo} orders by UTF-16 code unit, which places supplementary characters (encoded as surrogate pairs) before the characters
     * {@code U+E000} to {@code U+FFFF}; the byte order of the UTF-8 encoding places them after. Git sorts by bytes, so bytes are compared here.</p>
     *
     * @param left The first array
     * @param right The second array
     * @return A negative, zero or positive value if {@code left} sorts before, equal to or after {@code right}
     */
    private static int compareUnsigned(final byte[] left, final byte[] right) {
        final int common = Math.min(left.length, right.length);
        for (int i = 0; i < common; i++) {
            final int diff = (left[i] & 0xFF) - (right[i] & 0xFF);
            if (diff != 0) {
                return diff;
            }
        }
        // One key is a prefix of the other: the shorter one sorts first.
        return left.length - right.length;
    }

    /**
     * Extracts the last name element of a path.
     *
     * @param path The path of the entry
     * @return The file name as a string
     * @throws IllegalArgumentException If the path has no file name element
     */
    private static String getFileName(final Path path) {
        final Path fileName = path.getFileName();
        if (fileName == null) {
            throw new IllegalArgumentException(path.toString());
        }
        return fileName.toString();
    }

    /**
     * Creates an entry
     *
     * @param name The name of the entry
     * @param type The type of the entry
     * @param rawObjectId The id of the entry
     */
    private GitDirectoryEntry(final String name, final Type type, final byte[] rawObjectId) {
        this.name = name;
        this.type = type;
        this.sortKey = (type == Type.DIRECTORY ? name + "/" : name).getBytes(StandardCharsets.UTF_8);
        this.rawObjectId = rawObjectId;
    }

    /**
     * Creates an entry
     *
     * @param path The path of the entry; must not be an empty path
     * @param type The type of the entry
     * @param rawObjectId The id of the entry
     * @throws IllegalArgumentException If the path is empty
     * @throws NullPointerException If any argument is {@code null}
     */
    GitDirectoryEntry(final Path path, final Type type, final byte[] rawObjectId) {
        this(getFileName(path), Objects.requireNonNull(type), Objects.requireNonNull(rawObjectId));
    }

    /**
     * Returns the binary encoding of this entry as it appears inside a Git tree object.
     *
     * <p>The format follows the Git tree entry layout:</p>
     * <pre>
     *   &lt;mode&gt; SP &lt;name&gt; NUL &lt;raw-object-id&gt;
     * </pre>
     *
     * <p>The name is encoded as UTF-8 and the raw object id is written as given to the constructor, so its length depends on the digest that produced it
     * (20 bytes for SHA-1).</p>
     *
     * @return the binary tree-entry encoding; never {@code null}
     */
    byte[] toTreeEntryBytes() {
        final byte[] nameBytes = name.getBytes(StandardCharsets.UTF_8);
        // Two extra bytes: the space after the mode and the NUL after the name.
        final byte[] result = new byte[type.mode.length + nameBytes.length + rawObjectId.length + 2];
        System.arraycopy(type.mode, 0, result, 0, type.mode.length);
        result[type.mode.length] = ' ';
        System.arraycopy(nameBytes, 0, result, type.mode.length + 1, nameBytes.length);
        result[type.mode.length + nameBytes.length + 1] = '\0';
        System.arraycopy(rawObjectId, 0, result, type.mode.length + nameBytes.length + 2, rawObjectId.length);
        return result;
    }

    /**
     * Orders entries by the unsigned bytes of their sort key, which is the order Git uses inside a tree object.
     *
     * @param o The entry to compare to
     * @return A negative, zero or positive value if this entry sorts before, equal to or after {@code o}
     */
    @Override
    public int compareTo(final GitDirectoryEntry o) {
        return compareUnsigned(sortKey, o.sortKey);
    }

    @Override
    public int hashCode() {
        return name.hashCode();
    }

    @Override
    public boolean equals(final Object obj) {
        if (obj == this) {
            return true;
        }
        if (!(obj instanceof GitDirectoryEntry)) {
            return false;
        }
        final GitDirectoryEntry other = (GitDirectoryEntry) obj;
        return name.equals(other.name);
    }

    /**
     * The type of a Git tree entry, which maps to a Unix file-mode string.
     *
     * <p>Git encodes the file type and permission bits as an ASCII octal string that precedes the entry name in the binary tree format. The values defined here
     * cover the four entry types that Git itself produces.</p>
     *
     * <p>This enum is package-private. If it were made public, {@link #mode} would need to be wrapped in an immutable copy to prevent external mutation.</p>
     */
    enum Type {

        /**
         * A sub-directory (Git sub-tree)
         */
        DIRECTORY("40000"),

        /**
         * An executable file
         */
        EXECUTABLE("100755"),

        /**
         * A regular (non-executable) file
         */
        REGULAR("100644"),

        /**
         * A symbolic link
         */
        SYMBOLIC_LINK("120000");

        /**
         * The ASCII-encoded octal mode string as it appears in the binary tree entry.
         */
        private final byte[] mode;

        Type(final String mode) {
            this.mode = mode.getBytes(StandardCharsets.US_ASCII);
        }
    }
}

0 commit comments

Comments
 (0)