Make JGroups with TLS startup more robust

Fixes #37887

Signed-off-by: Pedro Ruivo <pruivo@redhat.com>
This commit is contained in:
Pedro Ruivo 2025-03-07 07:21:53 +00:00 committed by GitHub
parent b1785ce179
commit 5efb7cf76e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 67 additions and 29 deletions

View File

@ -66,12 +66,14 @@ public class CertificateReloadManager implements Lifecycle {
public static final String CERTIFICATE_ID = "crt_jgroups";
private static final String JGROUPS_SUBJECT = "jgroups";
private static final Duration RETRY_WAIT_TIME = Duration.ofMinutes(1);
// Initial delay and period of the boot-time reload task scheduled in start().
private static final Duration BOOT_PERIOD = Duration.ofMillis(200);
// Used to run the certificate load/replace jobs inside a transaction.
private final KeycloakSessionFactory sessionFactory;
// Receives the certificate read from storage and calls back through its exception handler.
private final JGroupsCertificateHolder certificateHolder;
private volatile long rotationSeconds;
// Acquired around certificate reload/rotation and around every visible read/write of bootFuture.
private final AutoCloseableLock lock;
private ScheduledFuture<?> scheduledFuture;
// Handle of the boot-time reload task: assigned in start(), cancelled and cleared in reloadCertificate().
private ScheduledFuture<?> bootFuture;
@Inject
EmbeddedCacheManager cacheManager;
@ -96,16 +98,25 @@ public class CertificateReloadManager implements Lifecycle {
/**
 * Starts the reload manager: registers this instance as a listener on the notifier,
 * installs {@link #onInvalidCertificate()} as the certificate holder's exception handler,
 * and schedules {@link #bootReload()} at a fixed rate of {@code BOOT_PERIOD}.
 */
@Override
@Start
public void start() {
// NOTE(review): the same message is logged at debug and at info on the next two lines.
// This looks like the old and new line of a rendered diff; confirm which one the real file keeps.
logger.debug("Starting JGroups certificate reload manager");
logger.info("Starting JGroups certificate reload manager");
notifier.addListener(this);
// NOTE(review): reloadCertificate() cancels bootFuture when it is non-null; confirm whether
// these two calls are still part of start() in the current file or were removed by the change.
reloadCertificate();
scheduleNextRotation();
certificateHolder.setExceptionHandler(this::onInvalidCertificate);
// The lock is taken explicitly and then handed to try-with-resources, which closes it on exit
// (presumably releasing it; confirm against AutoCloseableLock.close()).
lock.lock();
try(lock) {
// Invoked before JGroups starts; schedules a fast-paced reload of the certificate.
// It is canceled when it gets a view from JGroups.
// This prevents problems when a node joins while a rotation is in progress.
// Each tick hands bootReload() to the blocking manager instead of running it on the scheduler thread.
bootFuture = scheduledExecutorService.scheduleAtFixedRate(() -> blockingManager.runBlocking(this::bootReload, "boot-reload"), BOOT_PERIOD.toMillis(), BOOT_PERIOD.toMillis(), TimeUnit.MILLISECONDS);
}
}
@Override
@Stop
public void stop() {
logger.debug("Stopping JGroups certificate reload manager");
logger.info("Stopping JGroups certificate reload manager");
notifier.removeListener(this);
lock.lock();
try (lock) {
@ -120,7 +131,7 @@ public class CertificateReloadManager implements Lifecycle {
* Creates and reload a new certificate.
*/
public void rotateCertificate() {
logger.debug("Rotate JGroups certificate");
logger.info("Rotating JGroups certificate");
lock.lock();
try (lock) {
KeycloakModelUtils.runJobInTransaction(sessionFactory, this::replaceCertificateInTransaction);
@ -135,9 +146,13 @@ public class CertificateReloadManager implements Lifecycle {
* Reloads the certificate from storage.
*/
public void reloadCertificate() {
logger.debug("Reload JGroups Certificate");
logger.info("Reloading JGroups Certificate");
lock.lock();
try (lock) {
if (bootFuture != null) {
bootFuture.cancel(true);
bootFuture = null;
}
var maybeCrt = KeycloakModelUtils.runJobInTransactionWithResult(sessionFactory, CertificateReloadManager::loadCertificateInTransaction);
if (maybeCrt.isEmpty()) {
return;
@ -183,8 +198,23 @@ public class CertificateReloadManager implements Lifecycle {
}
}
/**
 * Single step of the boot-time reload cycle.
 * <p>
 * Under the lock, loads the stored certificate in a transaction and, if one is found,
 * parses it and passes it to the certificate holder. A {@link GeneralSecurityException}
 * or {@link IOException} raised along the way is logged as a warning and ignored.
 */
private void bootReload() {
    logger.info("[Boot] reloading certificate.");
    lock.lock();
    try (lock) {
        var stored = KeycloakModelUtils.runJobInTransactionWithResult(sessionFactory, CertificateReloadManager::loadCertificateInTransaction);
        // Nothing stored yet: leave the currently used certificate untouched.
        if (!stored.isEmpty()) {
            certificateHolder.useCertificate(JGroupsCertificate.fromJson(stored.get()));
        }
    } catch (GeneralSecurityException | IOException e) {
        logger.warn("Exception on boot reload cycle. Ignoring it.", e);
    }
}
/**
 * Callback registered in start() as the certificate holder's exception handler.
 * Hands {@link #reloadCertificate()} to the blocking manager instead of running it on the calling thread.
 */
private void onInvalidCertificate() {
// NOTE(review): the same message is logged at debug and at info; this looks like the old and
// new line of a rendered diff. Confirm which one the real file keeps.
logger.debug("On certificate exception");
logger.info("On certificate exception");
blockingManager.runBlocking(this::reloadCertificate, "invalid-certificate");
}

View File

@ -40,13 +40,13 @@ public class JpaServerConfigStorageProvider implements ServerConfigStorageProvid
/**
 * Looks up the value stored under {@code key}.
 *
 * @param key the configuration key
 * @return the entity's value, or an empty {@link Optional} when {@code getEntity} returns {@code null}
 */
@Override
public Optional<String> find(String key) {
// NOTE(review): two alternative heads of the same return expression follow (with and without an
// explicit LockModeType). This looks like the old and new line of a rendered diff; confirm which is current.
return Optional.ofNullable(getEntity(key, LockModeType.READ))
return Optional.ofNullable(getEntity(key))
.map(ServerConfigEntity::getValue);
}
@Override
public void store(String key, String value) {
var entity = getEntity(key, LockModeType.WRITE);
var entity = getEntity(key);
if (entity == null) {
entity = new ServerConfigEntity();
entity.setKey(Objects.requireNonNull(key));
@ -60,7 +60,7 @@ public class JpaServerConfigStorageProvider implements ServerConfigStorageProvid
@Override
public void remove(String key) {
var entity = getEntity(key, LockModeType.WRITE);
var entity = getEntity(key);
if (entity != null) {
entityManager.remove(entity);
}
@ -68,7 +68,7 @@ public class JpaServerConfigStorageProvider implements ServerConfigStorageProvid
@Override
public String loadOrCreate(String key, Supplier<String> valueGenerator) {
var entity = getEntity(key, LockModeType.OPTIMISTIC);
var entity = getEntity(key);
if (entity != null) {
return entity.getValue();
}
@ -84,7 +84,7 @@ public class JpaServerConfigStorageProvider implements ServerConfigStorageProvid
public boolean replace(String key, Predicate<String> replacePredicate, Supplier<String> valueGenerator) {
Objects.requireNonNull(replacePredicate);
Objects.requireNonNull(valueGenerator);
var entity = getEntity(key, LockModeType.OPTIMISTIC);
var entity = getEntity(key);
if (entity == null || !replacePredicate.test(entity.getValue())) {
return false;
}
@ -98,7 +98,10 @@ public class JpaServerConfigStorageProvider implements ServerConfigStorageProvid
//no-op
}
private ServerConfigEntity getEntity(String key, LockModeType lockModeType) {
return entityManager.find(ServerConfigEntity.class, Objects.requireNonNull(key), lockModeType);
/**
 * Fetches the {@code ServerConfigEntity} stored under {@code key} using optimistic locking.
 *
 * @param key the entity key; must not be {@code null}
 * @return whatever {@code entityManager.find} returns for that key
 * @throws NullPointerException if {@code key} is {@code null}
 */
private ServerConfigEntity getEntity(String key) {
    var checkedKey = Objects.requireNonNull(key);
    // OPTIMISTIC is sufficient here. Per its Javadoc it prevents this scenario:
    // transaction T1 reads a row, another transaction T2 then modifies or deletes that row
    // before T1 has committed, and both transactions eventually commit successfully.
    return entityManager.find(ServerConfigEntity.class, checkedKey, LockModeType.OPTIMISTIC);
}
}

View File

@ -32,6 +32,7 @@ import org.keycloak.infinispan.module.certificates.CertificateReloadManager;
import org.keycloak.infinispan.module.certificates.JGroupsCertificateHolder;
import org.keycloak.infinispan.module.configuration.global.KeycloakConfigurationBuilder;
import org.keycloak.models.KeycloakSession;
import org.keycloak.models.KeycloakSessionFactory;
import org.keycloak.models.utils.KeycloakModelUtils;
import org.keycloak.storage.configuration.ServerConfigStorageProvider;
@ -52,8 +53,9 @@ public class JpaJGroupsTlsConfigurator extends BaseJGroupsTlsConfigurator {
private static final String TLS_PROTOCOL_VERSION = "TLSv1.3";
// Protocol name passed to SSLContext.getInstance in createSocketFactory.
private static final String TLS_PROTOCOL = "TLS";
// NOTE(review): STARTUP_RETRIES and STARTUP_RETRY_SLEEP_MILLIS each appear twice below with
// different values (2/10 and 5/500). This looks like the old and new lines of a rendered diff;
// confirm that only the second pair exists in the real file.
private static final int STARTUP_RETRIES = 2;
private static final int STARTUP_RETRY_SLEEP_MILLIS = 10;
// 2.5 seconds in the worst case. Unlikely to happen, except if the DB connection is unreliable.
// Both values are passed to Retry.call when loading the initial certificate.
private static final int STARTUP_RETRIES = 5;
private static final int STARTUP_RETRY_SLEEP_MILLIS = 500;
public static final JpaJGroupsTlsConfigurator INSTANCE = new JpaJGroupsTlsConfigurator();
@Override
@ -65,23 +67,26 @@ public class JpaJGroupsTlsConfigurator extends BaseJGroupsTlsConfigurator {
/**
 * Builds the socket factory for JGroups TLS.
 * <p>
 * Obtains the JGroups certificate holder, registers it together with the session factory and
 * the rotation setting on the Keycloak configuration module, then initialises an
 * {@link SSLContext} with the holder's key and trust managers and wraps it via {@code createFromContext}.
 * <p>
 * NOTE(review): this span holds two variants of the same logic (a {@code Retry.call} lambda that
 * loads the holder and builds the context inside the retry, and a straight-line version that loads
 * the holder first via {@code loadInitialCertificateWithRetry}). {@code crtHolder} is declared in
 * both, and statements follow a {@code return}. This looks like the old and new lines of a rendered
 * diff; confirm against the real file before relying on either variant.
 *
 * @param holder  configuration holder whose global configuration receives the Keycloak module
 * @param session session used to obtain the {@code KeycloakSessionFactory}
 * @return the socket factory created from the initialised {@link SSLContext}
 * @throws RuntimeException wrapping {@link KeyManagementException} or {@link NoSuchAlgorithmException}
 */
SocketFactory createSocketFactory(ConfigurationBuilderHolder holder, KeycloakSession session) {
var factory = session.getKeycloakSessionFactory();
var kcConfig = holder.getGlobalConfigurationBuilder().addModule(KeycloakConfigurationBuilder.class);
kcConfig.setKeycloakSessionFactory(factory);
// Straight-line variant: the certificate is loaded (with retries) before anything else is configured.
var crtHolder = loadInitialCertificateWithRetry(factory);
kcConfig.setJGroupsCertificateRotation(requiredIntegerProperty(CachingOptions.CACHE_EMBEDDED_MTLS_ROTATION));
// Retry variant: certificate loading and SSLContext setup both happen inside the retried lambda.
return Retry.call(iteration -> {
try {
var crtHolder = KeycloakModelUtils.runJobInTransactionWithResult(factory, this::createSocketFactoryInTransaction);
var sslContext = SSLContext.getInstance(TLS_PROTOCOL);
sslContext.init(new KeyManager[]{crtHolder.keyManager()}, new TrustManager[]{crtHolder.trustManager()}, null);
var sf = createFromContext(sslContext);
kcConfig.setJGroupCertificateHolder(crtHolder);
return sf;
} catch (KeyManagementException | NoSuchAlgorithmException e) {
throw new RuntimeException(e);
}
}, STARTUP_RETRIES, STARTUP_RETRY_SLEEP_MILLIS);
kcConfig.setKeycloakSessionFactory(factory);
kcConfig.setJGroupCertificateHolder(crtHolder);
try {
var sslContext = SSLContext.getInstance(TLS_PROTOCOL);
// A null SecureRandom makes SSLContext.init fall back to its default source of randomness.
sslContext.init(new KeyManager[]{crtHolder.keyManager()}, new TrustManager[]{crtHolder.trustManager()}, null);
return createFromContext(sslContext);
} catch (KeyManagementException | NoSuchAlgorithmException e) {
// we should have valid certificates and keys.
throw new RuntimeException(e);
}
}
private JGroupsCertificateHolder createSocketFactoryInTransaction(KeycloakSession session) {
/**
 * Loads (or creates) the initial JGroups certificate inside a transaction, delegating
 * retries to {@code Retry.call} with {@code STARTUP_RETRIES} and {@code STARTUP_RETRY_SLEEP_MILLIS}.
 *
 * @param factory session factory used to open the transaction for each attempt
 * @return the certificate holder produced by {@code createOrLoadCertificate}
 */
private static JGroupsCertificateHolder loadInitialCertificateWithRetry(KeycloakSessionFactory factory) {
    return Retry.call(
            attempt -> KeycloakModelUtils.runJobInTransactionWithResult(
                    factory,
                    JpaJGroupsTlsConfigurator::createOrLoadCertificate),
            STARTUP_RETRIES,
            STARTUP_RETRY_SLEEP_MILLIS);
}
private static JGroupsCertificateHolder createOrLoadCertificate(KeycloakSession session) {
try {
var rotationDays = requiredIntegerProperty(CachingOptions.CACHE_EMBEDDED_MTLS_ROTATION);
var storage = session.getProvider(ServerConfigStorageProvider.class);