Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,17 @@ default String[] getSources() {
default int getParallelism() {
return Runtime.getRuntime().availableProcessors();
}

/**
* @return The number of source files to include in each batch when parsing with a shared
* compilation environment. A larger batch size amortises the cost of classpath
* initialisation over more files but increases the peak heap required during the
* parse phase; a smaller value reduces peak heap at the cost of more initialisations.
* The default of {@code 500} suits a 512 MB heap and typical-sized Java source
* files. Reduce this value if you encounter {@link OutOfMemoryError} on a large
* codebase, or increase it if you have a generous heap budget.
*/
default int getBatchSize() {
return 500;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.alfasoftware.astra.core.utils.ClassVisitor;
import org.eclipse.jdt.core.dom.ASTNode;
import org.eclipse.jdt.core.dom.CompilationUnit;
import org.eclipse.jdt.core.dom.IExtendedModifier;
import org.eclipse.jdt.core.dom.MethodDeclaration;
import org.eclipse.jdt.core.dom.Modifier;
import org.eclipse.jdt.core.dom.rewrite.ASTRewrite;
Expand All @@ -31,7 +32,9 @@ public void run(final CompilationUnit compilationUnit, final ASTNode node, final
.stream()
// find any public modifiers
.map(MethodDeclaration::modifiers)
.flatMap(List<Modifier>::stream)
.flatMap(List<IExtendedModifier>::stream)
.filter(Modifier.class::isInstance)
.map(Modifier.class::cast)
.filter(Modifier::isPublic)
// remove them
.forEach(m -> rewriter.remove(m, null));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
Expand All @@ -26,12 +29,14 @@

import org.alfasoftware.astra.core.refactoring.UseCase;
import org.alfasoftware.astra.core.refactoring.operations.imports.UnusedImportRefactor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.eclipse.jdt.core.dom.ASTParser;
import org.eclipse.jdt.core.dom.CompilationUnit;
import org.eclipse.jdt.core.dom.FileASTRequestor;
import org.eclipse.jdt.core.dom.rewrite.ASTRewrite;
import org.eclipse.jface.text.BadLocationException;
import org.eclipse.text.edits.MalformedTreeException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* AstraCore operates on source files in an input directory, building an AST for each file, using any additional classpaths supplied. 
Expand Down Expand Up @@ -61,7 +66,7 @@ public static void run(String targetDirectoryPath, UseCase useCase) {
validateSourceAndClasspath(sources, classPath);
try {
AstraCore main = new AstraCore();
main.runOperations(targetDirectoryPath, useCase, sources, classPath);
main.runOperations(targetDirectoryPath, useCase, sources, AstraUtils.filterClassPath(classPath));
} catch (IOException e) {
throw new RuntimeException("Astra run failed for directory [" + targetDirectoryPath + "]: " + e.getMessage(), e);
}
Expand All @@ -83,20 +88,23 @@ protected void runOperations(String directoryPath, UseCase useCase, String[] sou
AtomicLong currentPercentage = new AtomicLong();
Instant startTime = Instant.now();

// The same file-selection filter (.java extension + the UseCase path prefiltering predicate)
// is used for both the count walk and the processing walk, so the progress denominator can
// never drift from the set of files actually processed.
// Build the single file-selection filter used for path scanning, progress tracking, and
// chunk partitioning. Using one shared Predicate instance ensures the progress denominator
// can never drift from the set of files actually processed.
Path sourcePath = Paths.get(directoryPath);
Predicate<Path> fileFilter = buildFileFilter(useCase);

// First walk: a cheap count pass that reads only directory metadata (not file contents) to
// determine the total number of files to process, which feeds the progress percentage below.
log.info("Counting files (this may take a few seconds)");
long totalFiles;
// Single walk: collect all matching paths so we can (a) derive the progress denominator
// without a second walk and (b) partition the list into fixed-size chunks for
// bounded-memory batch parsing. Only Path objects are materialised here — file contents
// are read lazily, one chunk at a time.
log.info("Scanning for files (this may take a few seconds)");
List<Path> allPaths = new ArrayList<>();
try (Stream<Path> walk = Files.walk(sourcePath)) {
totalFiles = walk.filter(fileFilter).count();
walk.filter(fileFilter).forEach(allPaths::add);
}
log.info(totalFiles + " files to process after prefiltering");
long totalFiles = allPaths.size();
log.info(totalFiles + " files to process after path based prefiltering");

if (totalFiles == 0) {
log.info(getPrintableDuration(Duration.between(startTime, Instant.now())));
Expand All @@ -105,36 +113,107 @@ protected void runOperations(String directoryPath, UseCase useCase, String[] sou

Set<? extends ASTOperation> operations = useCase.getOperations();
int parallelism = useCase.getParallelism();
int batchSize = useCase.getBatchSize();
Predicate<String> contentPrefilteringPredicate = useCase.getContentPrefilteringPredicate();
log.info("Processing [" + totalFiles + "] files with [" + parallelism + "] thread(s)");
log.info("Processing [" + totalFiles + "] files with [" + parallelism + "] thread(s), batch size [" + batchSize + "]");

// Process files in fixed-size chunks to keep peak heap bounded. For each chunk we read
// content, apply the content-prefiltering predicate, and batch-parse only the files that
// pass. The per-chunk Maps and CompilationUnit objects are eligible for GC as soon as the
// chunk's futures have been waited on, so peak heap scales with batchSize — not totalFiles.
List<Throwable> fileErrors = new ArrayList<>();
ExecutorService executor = Executors.newFixedThreadPool(parallelism);
int numberOfChunks = (int) Math.ceil((double) allPaths.size() / batchSize);
if (numberOfChunks > 1) {
log.info("Processing in [" + numberOfChunks + "] chunk(s) of up to [" + batchSize + "] file(s) each");
}

try {
// Second walk: stream paths directly into the executor as they are encountered, without
// materialising every path into a List first. This avoids holding all paths in memory at
// once on very large source trees.
List<Future<?>> futures = new ArrayList<>();
try (Stream<Path> walk = Files.walk(sourcePath)) {
walk.filter(fileFilter)
.forEach(f -> futures.add(executor.submit(() -> applyOperationsAndSave(f, operations, sources, classPath, contentPrefilteringPredicate))));
}
for (int chunkIndex = 0; chunkIndex < numberOfChunks; chunkIndex++) {
int chunkStart = chunkIndex * batchSize;
int chunkEnd = Math.min(chunkStart + batchSize, allPaths.size());
List<Path> chunk = allPaths.subList(chunkStart, chunkEnd);

// Read and content-prefilter only this chunk's files. Limiting content reads to one
// chunk at a time keeps peak heap proportional to batchSize rather than totalFiles.
List<Path> chunkToParse = new ArrayList<>();
Map<String, String> chunkContent = new LinkedHashMap<>();
List<Path> chunkContentFiltered = new ArrayList<>();
Map<Path, RuntimeException> chunkReadFailures = new LinkedHashMap<>();

for (Path path : chunk) {
try {
String content = new String(Files.readAllBytes(path.toAbsolutePath()));
if (contentPrefilteringPredicate.test(content)) {
chunkToParse.add(path);
chunkContent.put(path.toAbsolutePath().normalize().toString(), content);
} else {
log.debug("Skipping [{}] — excluded by content pre-filtering predicate", path);
chunkContentFiltered.add(path);
}
} catch (IOException e) {
chunkReadFailures.put(path, new RuntimeException(
"Failed to read file [" + path + "]: " + e.getMessage(), e));
}
}

for (Future<?> future : futures) {
try {
future.get();
} catch (ExecutionException e) {
Throwable cause = e.getCause();
log.error("Failed to process file: " + cause.getMessage(), cause);
fileErrors.add(cause);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("File processing was interrupted", e);
if (numberOfChunks > 1) {
log.info("Batch parsing chunk [" + (chunkIndex + 1) + "/" + numberOfChunks + "] — " + chunkToParse.size() + " file(s)");
} else {
log.info("Batch parsing [" + chunkToParse.size() + "] file(s) with shared compilation environment");
}
long idx = currentFileIndex.incrementAndGet();
long newPct = idx * 100 / totalFiles;
if (currentPercentage.getAndSet(newPct) != newPct) {
logProgress(idx, newPct, startTime, totalFiles);

Map<String, CompilationUnit> parsedUnits = batchParseFiles(chunkToParse, sources, classPath);

List<Future<?>> chunkFutures = new ArrayList<>();

// Submit work futures for files that were batch-parsed.
for (Path path : chunkToParse) {
String key = path.toAbsolutePath().normalize().toString();
CompilationUnit cu = parsedUnits.get(key);
String content = chunkContent.get(key);
if (cu != null && content != null) {
chunkFutures.add(executor.submit(() ->
applyOperationsAndSaveWithPreParsedCompilationUnit(path, content, cu, operations, sources, classPath)));
} else {
// Defensive fallback: batch parse did not return a CU (should not happen with JDT).
log.warn("Batch parse produced no CompilationUnit for [{}]; falling back to per-file parse", path);
chunkFutures.add(executor.submit(() ->
applyOperationsAndSave(path, operations, sources, classPath, s -> true)));
}
}

// Submit failure futures for files that could not be read (surfaces them via future.get()).
for (Map.Entry<Path, RuntimeException> entry : chunkReadFailures.entrySet()) {
RuntimeException ex = entry.getValue();
chunkFutures.add(executor.submit((Runnable) () -> { throw ex; }));
}

// Submit no-op futures for content-filtered files so that they count toward the progress
// denominator, preserving the same progress behaviour as the previous per-file code path.
for (int i = 0; i < chunkContentFiltered.size(); i++) {
chunkFutures.add(executor.submit(() -> {}));
}

// Wait for all futures in this chunk before parsing the next chunk. This ensures that
// the CompilationUnit objects captured by the submitted tasks can be garbage-collected
// before the next chunk is loaded, keeping peak heap proportional to batchSize.
for (Future<?> future : chunkFutures) {
try {
future.get();
} catch (ExecutionException e) {
Throwable cause = e.getCause();
log.error("Failed to process file: " + cause.getMessage(), cause);
fileErrors.add(cause);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("File processing was interrupted", e);
}
long idx = currentFileIndex.incrementAndGet();
long newPct = idx * 100 / totalFiles;
if (currentPercentage.getAndSet(newPct) != newPct) {
logProgress(idx, newPct, startTime, totalFiles);
}
}
}
} finally {
Expand All @@ -155,6 +234,105 @@ protected void runOperations(String directoryPath, UseCase useCase, String[] sou
}


/**
* Batch-parses one chunk of source files using a single shared JDT compilation environment.
* Called once per chunk during a run; the number of paths is bounded by
* {@link UseCase#getBatchSize()}.
*
* <p>A single {@link ASTParser} is configured with the supplied classpath and source paths,
* and {@link ASTParser#createASTs} is called with the chunk's file paths in one shot. JDT
* initialises its internal {@code LookupEnvironment} — which involves scanning every JAR
* and source root on the classpath — exactly once for the chunk rather than once per
* file, which is the primary cost saving over the per-file {@code createAST()} API.
*
* <p><strong>Thread safety:</strong> {@code createASTs()} processes files sequentially on
* the calling thread, calling back into {@code acceptAST()} for each one. Bindings are
* resolved eagerly during parsing; by the time this method returns the returned
* {@link CompilationUnit} objects are fully resolved and can be read safely from multiple
* worker threads in the subsequent parallel operation-application phase — provided those
* threads do not themselves trigger new binding lookups that write to the shared
* {@code LookupEnvironment}. In practice, all Astra operations only <em>read</em>
* already-resolved bindings, so concurrent operation application is safe.
*
* @return a map from normalised absolute path string to {@link CompilationUnit}; the path
* strings are exactly those returned by
* {@link Path#toAbsolutePath()}{@code .normalize().toString()} for each input path.
*/
private static Map<String, CompilationUnit> batchParseFiles(
List<Path> paths, String[] sources, String[] classPath) {

Map<String, CompilationUnit> result = new HashMap<>(paths.size() * 2);

if (paths.isEmpty()) {
return result;
}

ASTParser parser = AstraUtils.createBatchParser(sources, classPath);

String[] absolutePaths = paths.stream()
.map(p -> p.toAbsolutePath().normalize().toString())
.toArray(String[]::new);
String[] fileEncodings = new String[absolutePaths.length];
Arrays.fill(fileEncodings, "UTF-8");

parser.createASTs(absolutePaths, fileEncodings, new String[0],
new FileASTRequestor() {
@Override
public void acceptAST(String sourceFilePath, CompilationUnit ast) {
// sourceFilePath is exactly what we passed in (absolute + normalised).
ast.setProperty(CompilationUnitProperty.ABSOLUTE_PATH,
Paths.get(sourceFilePath).toAbsolutePath());
ast.recordModifications();
result.put(sourceFilePath, ast);
}
}, null);

return result;
}


/**
* Applies {@code operations} to a file whose {@link CompilationUnit} was already produced by
* the batch parse, then runs import cleanup and writes the file back if content changed.
*
* <p>This mirrors the logic in {@link #applyOperationsAndSave} but skips the per-file
* {@link ASTParser} / {@link AstraUtils#readAsCompilationUnit} call that would otherwise
* re-initialise the JDT classpath environment for this individual file.
*
* <p>This method is designed to be called from multiple worker threads in parallel; each
* invocation operates exclusively on its own {@link CompilationUnit} and {@link ASTRewrite},
* so there is no shared mutable state between threads.
*/
private void applyOperationsAndSaveWithPreParsedCompilationUnit(
Path javaFile,
String fileContentBefore,
CompilationUnit preParseUnit,
Set<? extends ASTOperation> operations,
String[] sources,
String[] classpath) {
try {
ASTRewrite rewriter = runOperations(operations, preParseUnit);
String fileContentAfter = makeChangesFromAST(fileContentBefore, rewriter);

if (fileContentAfter.equals(fileContentBefore)) {
return;
}

// File was changed: run import cleanup and write back.
fileContentAfter = applyOperationsToSource(
new HashSet<>(Arrays.asList(new UnusedImportRefactor())),
sources, classpath, javaFile, fileContentAfter);

if (!fileContentAfter.equals(fileContentBefore)) {
Files.write(javaFile.toAbsolutePath(), fileContentAfter.getBytes(),
StandardOpenOption.TRUNCATE_EXISTING);
}
} catch (IOException | BadLocationException | IllegalArgumentException e) {
throw new RuntimeException("Failed to process file [" + javaFile + "]: " + e.getMessage(), e);
}
}


/**
* Builds the single, shared file-selection filter applied to every {@link Path} encountered when
* walking the source directory. This combines the {@code .java} file check with the path-level
Expand Down
Loading
Loading