Skip to content

Commit 0cf6ed2

Browse files
jaikiranjonathan-gibbons
authored andcommittedJan 22, 2024
7903580: Allow for re-attempting agent creation when an attempt fails
Reviewed-by: jjg, cstein
1 parent fcaa9e8 commit 0cf6ed2

File tree

6 files changed

+159
-9
lines changed

6 files changed

+159
-9
lines changed
 

‎src/share/classes/com/sun/javatest/regtest/exec/Agent.java

+107-5
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
import java.io.InputStreamReader;
3838
import java.io.PrintStream;
3939
import java.io.PrintWriter;
40+
import java.lang.reflect.Method;
4041
import java.net.InetSocketAddress;
4142
import java.net.ServerSocket;
4243
import java.net.Socket;
@@ -90,12 +91,30 @@ public static class Fault extends Exception {
9091
static final boolean showAgent = Flags.get("showAgent");
9192
static final boolean traceAgent = Flags.get("traceAgent");
9293

94+
// the following code here allows us to run jtreg on older
95+
// JDKs where the pid() method is unavailable on the
96+
// Process class. we use PID only for debug purposes and
97+
// the inability to get the PID of a launched AgentServer
98+
// is OK.
99+
private static final long UNKNOWN_PID = -1;
100+
private static final Method PID_METHOD;
101+
static {
102+
Method pidMethod = null;
103+
try {
104+
pidMethod = Process.class.getDeclaredMethod("pid"); // only available in Java 9+
105+
} catch (Exception e) {
106+
pidMethod = null;
107+
}
108+
PID_METHOD = pidMethod;
109+
}
110+
93111
/**
94112
* Start a JDK with given JVM options.
95113
*/
96114
private Agent(File dir, JDK jdk, List<String> vmOpts, Map<String, String> envVars,
97115
File policyFile, float timeoutFactor, Logger logger,
98116
String testThreadFactory, String testThreadFactoryPath) throws Fault {
117+
Process agentServerProcess = null;
99118
try {
100119
id = ++count;
101120
this.jdk = jdk;
@@ -127,8 +146,9 @@ private Agent(File dir, JDK jdk, List<String> vmOpts, Map<String, String> envVar
127146
// is platform-specific, and Solaris has it on by default.
128147
ss.setReuseAddress(false);
129148
ss.bind(new InetSocketAddress(/*port:*/ 0), /*backlog:*/ 1);
149+
final int port = ss.getLocalPort();
130150
cmd.add(AgentServer.PORT);
131-
cmd.add(String.valueOf(ss.getLocalPort()));
151+
cmd.add(String.valueOf(port));
132152

133153
if (timeoutFactor != 1.0f) {
134154
cmd.add(AgentServer.TIMEOUTFACTOR);
@@ -144,22 +164,27 @@ private Agent(File dir, JDK jdk, List<String> vmOpts, Map<String, String> envVar
144164
cmd.add(CUSTOM_TEST_THREAD_FACTORY_PATH);
145165
cmd.add(testThreadFactoryPath);
146166
}
147-
log("Started " + cmd);
167+
log("Launching " + cmd);
148168

149169
ProcessBuilder pb = new ProcessBuilder(cmd);
150170
pb.directory(dir);
151171
Map<String, String> env = pb.environment();
152172
env.clear();
153173
env.putAll(envVars);
154-
process = pb.start();
174+
agentServerProcess = process = pb.start();
175+
final long pid = getPid(process);
155176
copyAgentProcessStream("stdout", process.getInputStream());
156177
copyAgentProcessStream("stderr", process.getErrorStream());
157178

158179
try {
159180
final int ACCEPT_TIMEOUT = (int) (60 * 1000 * timeoutFactor);
160-
// default 60 seconds, for server to start and "phone home"
181+
// default 60 seconds, for server to start and "phone home"
161182
ss.setSoTimeout(ACCEPT_TIMEOUT);
183+
log("Waiting up to " + ACCEPT_TIMEOUT + " milli seconds for a" +
184+
" socket connection on port " + port +
185+
(pid != UNKNOWN_PID ? " from process " + pid : ""));
162186
Socket s = ss.accept();
187+
log("Received connection on port " + port + " from " + s);
163188
s.setSoTimeout((int)(KeepAlive.READ_TIMEOUT * timeoutFactor));
164189
in = new DataInputStream(s.getInputStream());
165190
out = new DataOutputStream(s.getOutputStream());
@@ -171,6 +196,16 @@ private Agent(File dir, JDK jdk, List<String> vmOpts, Map<String, String> envVar
171196
// send keep-alive messages to server while not executing actions
172197
keepAlive.setEnabled(true);
173198
} catch (IOException e) {
199+
log("Agent creation failed due to " + e);
200+
if (agentServerProcess != null) {
201+
// kill the launched process
202+
log("killing AgentServer process");
203+
try {
204+
ProcessUtils.destroyForcibly(agentServerProcess);
205+
} catch (Exception ignored) {
206+
// ignore
207+
}
208+
}
174209
throw new Fault(e);
175210
}
176211
}
@@ -578,6 +613,17 @@ private void log(String message, PrintStream out) {
578613
out.println("[" + AgentServer.logDateFormat.format(new Date()) + "] Agent[" + getId() + "]: " + message);
579614
}
580615

616+
private static long getPid(final Process process) {
617+
if (PID_METHOD == null) {
618+
return UNKNOWN_PID;
619+
}
620+
try {
621+
return (long) PID_METHOD.invoke(process);
622+
} catch (Exception e) {
623+
return UNKNOWN_PID;
624+
}
625+
}
626+
581627
final JDK jdk;
582628
final List<String> vmOpts;
583629
final File execDir;
@@ -730,6 +776,20 @@ public void setMaxPoolSize(int size) {
730776
logger.log(null, "POOL: max pool size: " + maxPoolSize);
731777
}
732778

779+
/**
780+
* Sets the maximum attempts to create or obtain an agent VM
781+
* @param numAttempts number of attempts
782+
* @throws IllegalArgumentException if {@code numAttempts} is less than {@code 1}
783+
*/
784+
public void setNumAgentSelectionAttempts(final int numAttempts) {
785+
if (numAttempts < 1) {
786+
throw new IllegalArgumentException("invalid value for agent selection attempts: "
787+
+ numAttempts);
788+
}
789+
this.numAgentSelectionAttempts = numAttempts;
790+
logger.log(null, "POOL: agent selection attempts: " + numAttempts);
791+
}
792+
733793
/**
734794
* Obtains an agent with the desired properties.
735795
* If a suitable agent already exists in the pool, it will be removed from the pool and
@@ -745,7 +805,48 @@ public void setMaxPoolSize(int size) {
745805
* @return the agent
746806
* @throws Fault if there is a problem obtaining a suitable agent
747807
*/
748-
synchronized Agent getAgent(File dir,
808+
Agent getAgent(File dir,
809+
JDK jdk,
810+
List<String> vmOpts,
811+
Map<String, String> envVars,
812+
String testThreadFactory,
813+
String testThreadFactoryPath)
814+
throws Fault {
815+
final int numAttempts = this.numAgentSelectionAttempts;
816+
assert numAttempts > 0 : "unexpected agent selection attempts: " + numAttempts;
817+
Agent.Fault toThrow = null;
818+
for (int i = 1; i <= numAttempts; i++) {
819+
try {
820+
if (i != 1) {
821+
logger.log(null, "POOL: re-attempting agent creation, attempt number " + i);
822+
}
823+
return doGetAgent(dir, jdk, vmOpts, envVars, testThreadFactory,
824+
testThreadFactoryPath);
825+
} catch (Agent.Fault f) {
826+
logger.log(null, "POOL: agent creation failed due to " + f.getCause());
827+
// keep track of the fault and reattempt to get an agent if within limit
828+
if (toThrow == null) {
829+
toThrow = f;
830+
} else {
831+
// add the previous exception as a suppressed exception
832+
// of the current one
833+
if (toThrow.getCause() != null) {
834+
f.addSuppressed(toThrow.getCause());
835+
}
836+
toThrow = f;
837+
}
838+
if (i == numAttempts || !(f.getCause() instanceof IOException)) {
839+
// we either made enough attempts or we failed due to a non IOException.
840+
// In either case we don't attempt to create an agent again and instead
841+
// throw the captured failure(s)
842+
throw toThrow;
843+
}
844+
}
845+
}
846+
throw new AssertionError("should not reach here");
847+
}
848+
849+
synchronized Agent doGetAgent(File dir,
749850
JDK jdk,
750851
List<String> vmOpts,
751852
Map<String, String> envVars,
@@ -927,6 +1028,7 @@ private static String getKey(File dir, JDK jdk, List<String> vmOpts) {
9271028
private float timeoutFactor = 1.0f;
9281029
private int maxPoolSize;
9291030
private Duration idleTimeout;
1031+
private int numAgentSelectionAttempts;
9301032
}
9311033

9321034
static class Stats {

‎src/share/classes/com/sun/javatest/regtest/tool/Tool.java

+23
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,22 @@ public void process(String opt, String arg) throws BadArgs {
461461
}
462462
},
463463

464+
new Option(GNU, AGENT_POOL, null, "--agent-attempts") {
465+
@Override
466+
public void process(String opt, String arg) throws BadArgs {
467+
int numTimes;
468+
try {
469+
numTimes = Integer.parseInt(arg);
470+
} catch (NumberFormatException e) {
471+
throw new BadArgs(i18n, "main.badAgentSelAttempt", arg);
472+
}
473+
if (numTimes < 1) {
474+
throw new BadArgs(i18n, "main.badAgentSelAttempt", numTimes);
475+
}
476+
numAgentSelectionAttempt = numTimes;
477+
}
478+
},
479+
464480
new Option(STD, MAIN, "", "-conc", "-concurrency") {
465481
@Override
466482
public void process(String opt, String arg) {
@@ -1343,6 +1359,7 @@ public int run() throws BadArgs, Fault, Harness.Fault, InterruptedException {
13431359
}
13441360
p.setMaxPoolSize(maxPoolSize);
13451361
p.setIdleTimeout(poolIdleTimeout);
1362+
p.setNumAgentSelectionAttempts(numAgentSelectionAttempt);
13461363
break;
13471364
case OTHERVM:
13481365
break;
@@ -2373,6 +2390,8 @@ public enum ReportMode { NONE, EXECUTED, FILES, ALL_EXECUTED, ALL }
23732390
private String testThreadFactoryPathArg;
23742391
private int maxPoolSize = -1;
23752392
private Duration poolIdleTimeout = Duration.ofSeconds(30);
2393+
// number of attempts to get an agent for an action
2394+
private int numAgentSelectionAttempt = DEFAULT_NUM_AGENT_SEL_ATTEMPT;
23762395
private List<String> testCompilerOpts = new ArrayList<>();
23772396
private List<String> testJavaOpts = new ArrayList<>();
23782397
private List<String> testVMOpts = new ArrayList<>();
@@ -2421,5 +2440,9 @@ public enum ReportMode { NONE, EXECUTED, FILES, ALL_EXECUTED, ALL }
24212440
"windir"
24222441
};
24232442

2443+
// default value for agent selection attempts. we default to 1, which implies
2444+
// by default we don't re-attempt on a failure
2445+
private static final int DEFAULT_NUM_AGENT_SEL_ATTEMPT = 1;
2446+
24242447
private static final I18NResourceBundle i18n = I18NResourceBundle.getBundleForClass(Tool.class);
24252448
}

‎src/share/classes/com/sun/javatest/regtest/tool/i18n.properties

+7
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,12 @@ help.pool.pool_idle_timeout.arg=<number>
294294
help.pool.pool_idle_timeout.desc=\
295295
The time, in seconds, before an idle VM in the pool is \
296296
automatically closed.
297+
help.pool.agent_attempts.arg=<number>
298+
help.pool.agent_attempts.desc=\
299+
The number of attempts jtreg will make to access an agent \
300+
with the desired characteristics, either by creating a new one, \
301+
or obtaining one from the pool of reusable VMs. The minimum value, \
302+
and the default value, is 1.
297303

298304
help.select.name=Test Selection Options
299305
help.select.desc=These options can be used to refine the set of tests to \
@@ -446,6 +452,7 @@ help.version.txt={0} {1}\nInstalled in {2}\nRunning on platform version {3} from
446452
help.version.unknown=(unknown)
447453

448454
main.badArgs=Error: {0}
455+
main.badAgentSelAttempt=Bad value for agent selection attempts: {0}
449456
main.badConcurrency=Bad use of -concurrency
450457
main.badKeywords=Bad keyword expression: {0}
451458
main.badLockFile=Bad lock file: {0}

‎src/share/doc/javatest/regtest/faq.md

+18
Original file line numberDiff line numberDiff line change
@@ -699,6 +699,24 @@ if the default mode was not used. For example,
699699
elapsed time (seconds): 0.141
700700
````
701701

702+
#### jtreg has trouble starting agents; what can I do?
703+
704+
When running in agent mode, JTReg creates agent VMs as and when necessary.
705+
Agent VM creation involves launching a process and communicating with it over
706+
a socket. The initial handshake between the newly launched process and JTReg
707+
can sometimes timeout if the system is under heavy load. This then causes the
708+
agent creation to fail. By default, JTReg does not re-attempt creation of the
709+
agent VM and instead the failure is propagated as a test action failure and
710+
that action gets reported as a failed.
711+
712+
This default behaviour can be overridden by passing the `--agent-attempts` option
713+
to `jtreg` command. This option takes an integer value which represents the number
714+
of attempts to make when attempting to get an agent for a test action. By default,
715+
the value of this option is `1`, implying JTReg will not re-attempt a failed
716+
attempt. Passing a higher value for this option will allow JTReg to re-attempt a
717+
failed attempt. For example, a value of `2` will allow JTReg to re-attempt once
718+
for each failed attempt.
719+
702720
### How do I specify whether to run tests concurrently?
703721

704722
jtreg provides the ability to run tests in parallel, using multiple

‎test/badtests/BadTests.gmk

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ $(BUILDTESTDIR)/BadTests.agentvm.ok: \
6868
> $(@:%.ok=%/jt.log) 2>&1 || \
6969
true "non-zero exit code from JavaTest intentionally ignored"
7070
$(GREP) -s 'Test results: passed: 17; failed: 1; error: 2' $(@:%.ok=%/jt.log) > /dev/null
71-
agents=`$(GREP) '^\[[-0-9 :,]*\] Agent\[[0-9][0-9]*\]: Started' $(@:%.ok=%/jt.log) | wc -l` ; \
71+
agents=`$(GREP) '^\[[-0-9 :,]*\] Agent\[[0-9][0-9]*\]: Launching' $(@:%.ok=%/jt.log) | wc -l` ; \
7272
if [ $${agents} -ne 4 ]; then echo "Unexpected number of agents used: $${agents}" ; exit 1 ; fi
7373
echo "test passed at `date`" > $@
7474

‎test/debug/DebugTest.gmk

+3-3
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,10 @@ $(BUILDTESTDIR)/DebugTest.agentvm.ok: \
6262
-debug:-Ddebug=true \
6363
$(TESTDIR)/debug \
6464
> $(@:%.ok=%/jt.log) 2>&1 ; rc=$$?
65-
agents=`$(GREP) '^\[[-0-9 :,]*\] Agent...: Started' $(@:%.ok=%)/jt.log | wc -l | tr -d ' '` ; \
66-
debugAgents=`$(GREP) '^\[[-0-9 :,]*\] Agent...: Started.*-Ddebug=true' $(@:%.ok=%)/jt.log | wc -l | tr -d ' '` ; \
65+
agents=`$(GREP) '^\[[-0-9 :,]*\] Agent...: Launching' $(@:%.ok=%)/jt.log | wc -l | tr -d ' '` ; \
66+
debugAgents=`$(GREP) '^\[[-0-9 :,]*\] Agent...: Launching.*-Ddebug=true' $(@:%.ok=%)/jt.log | wc -l | tr -d ' '` ; \
6767
if [ "$$agents" != 2 -o "$$debugAgents" != 1 ]; then \
68-
$(GREP) '^\[[-0-9 :,]*\] Agent...: Started' $(@:%.ok=%)/jt.log ; \
68+
$(GREP) '^\[[-0-9 :,]*\] Agent...: Launching' $(@:%.ok=%)/jt.log ; \
6969
echo "unexpected use of -debug options"; exit 1; \
7070
fi
7171
echo "test passed at `date`" > $@

0 commit comments

Comments
 (0)
Please sign in to comment.