respect-validation/src-dev/Commands/UpdateDomainSuffixesCommand.php
Henrique Moody 7892a7c902
Port Bash scripts to PHP
It makes more sense to use PHP to generate PHP code than to use Bash. I
love writing Bash scripts, but I know it's not for everyone, and they
can become quite complex. Porting them to PHP code also lowers the
barrier for people to change them.

While I was making those changes, I also noticed a problem with how we
save the domain suffixes. We're converting all of them to ASCII, so we
are not preserving languages such as Chinese, Thai, and Hebrew, which
use non-ASCII characters.
2026-01-06 10:06:22 +01:00

207 lines
5.7 KiB
PHP

<?php
/*
* Copyright (c) Alexandre Gomes Gaigalas <alganet@gmail.com>
* SPDX-License-Identifier: MIT
*/
declare(strict_types=1);
namespace Respect\Dev\Commands;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Component\VarExporter\VarExporter;
use function array_keys;
use function array_unique;
use function count;
use function dirname;
use function explode;
use function file_get_contents;
use function file_put_contents;
use function glob;
use function implode;
use function is_dir;
use function mb_strtoupper;
use function mkdir;
use function preg_match;
use function rmdir;
use function sort;
use function sprintf;
use function str_replace;
use function str_starts_with;
use function trim;
use function unlink;
use const PHP_EOL;
#[AsCommand(
    name: 'update:domain-suffixes',
    description: 'Update list of public domain suffixes',
)]
/**
 * Regenerates the per-TLD public suffix data files from the Public Suffix List.
 *
 * The command downloads the list, groups the ICANN suffixes by their top-level
 * domain and writes one PHP file per TLD (returning the list of suffixes) into
 * "data/domain/public-suffix". Names are upper-cased but otherwise kept in
 * their original UTF-8 form; they are not converted to Punycode.
 */
final class UpdateDomainSuffixesCommand extends Command
{
    /** URL the public suffix list is downloaded from. */
    private const string LIST_URL = 'https://publicsuffix.org/list/public_suffix_list.dat';

    /**
     * Downloads the list, parses it and rewrites the generated data files.
     *
     * The list is parsed before any existing data is deleted, so a failed or
     * unusable download leaves the previously generated files untouched.
     *
     * @return int Command::SUCCESS when every file was written, Command::FAILURE otherwise
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $io = new SymfonyStyle($input, $output);
        $io->title('Updating domain suffixes');

        // Download the list
        $io->section('Downloading list');
        $io->text(sprintf('Fetching from: %s', self::LIST_URL));
        $listContent = file_get_contents(self::LIST_URL);
        if ($listContent === false) {
            $io->error('Failed to download public suffix list');

            return Command::FAILURE;
        }

        $io->success('Downloaded successfully');

        // Process the list first: nothing on disk is touched until we know
        // the download actually contained suffix rules.
        $io->section('Processing public suffix list');
        $suffixes = $this->parseTldsList($listContent);
        $tlds = array_keys($suffixes);
        if ($tlds === []) {
            $io->error('Downloaded public suffix list does not contain any suffix');

            return Command::FAILURE;
        }

        sort($tlds);
        $io->text(sprintf('Found %d TLDs with suffixes', count($tlds)));

        // Clean old data
        $io->section('Removing old data');
        $suffixDir = dirname(__DIR__, 2) . '/data/domain/public-suffix';
        $this->removeDirectory($suffixDir);
        if (is_dir($suffixDir)) {
            // Removal is best-effort; if the directory survived, reuse it
            // instead of calling mkdir() on an existing path.
            $io->warning('Old data could not be removed completely, stale files may remain');
        } else {
            // The recursive flag also creates the parent "data/domain" directory.
            if (!mkdir($suffixDir, 0777, true) && !is_dir($suffixDir)) {
                $io->error(sprintf('Failed to create directory: %s', $suffixDir));

                return Command::FAILURE;
            }

            $io->success('Old data removed');
        }

        // Create files
        $io->section('Creating suffix files');
        $progressBar = $io->createProgressBar(count($tlds));
        $progressBar->start();

        foreach ($tlds as $tld) {
            // Stripping "*." and "!" can make distinct rules collapse into the
            // same suffix, so drop duplicates; sort() re-indexes the list.
            $suffixList = array_unique($suffixes[$tld]);

            // TLDs without any second-level suffix do not get a file.
            if ($suffixList === []) {
                $progressBar->advance();
                continue;
            }

            sort($suffixList);

            $fileContent = implode(PHP_EOL, [
                '<?php declare(strict_types=1);',
                '// Copyright (c) https://publicsuffix.org',
                '// SPDX-License-Identifier: MPL-2.0-no-copyleft-exception',
                'return ' . VarExporter::export($suffixList) . ';' . PHP_EOL,
            ]);

            // The file is named after the upper-cased TLD exactly as parsed
            // (internationalised TLDs keep their UTF-8 form).
            $filename = sprintf('%s/%s.php', $suffixDir, $tld);
            if (file_put_contents($filename, $fileContent) === false) {
                $io->newLine(2);
                $io->error(sprintf('Failed to write file: %s', $filename));

                return Command::FAILURE;
            }

            $progressBar->advance();
        }

        $progressBar->finish();
        $io->newLine(2);
        $io->success('Domain suffixes updated successfully');

        return Command::SUCCESS;
    }

    /**
     * Groups the suffix rules of a public suffix list by top-level domain.
     *
     * Every TLD that appears in the list gets a key (upper-cased). Only rules
     * located before the "===END ICANN DOMAINS===" marker contribute suffixes;
     * rules after the marker merely make sure their TLD key exists. Wildcard
     * ("*.") and exception ("!") markers are stripped from the rules.
     *
     * @return array<string, array<string>> upper-cased suffixes indexed by upper-cased TLD
     */
    private function parseTldsList(string $content): array
    {
        $suffixes = [];
        $inIcannSection = true;

        foreach (explode("\n", $content) as $line) {
            $line = trim($line);

            // The marker is itself a comment line, so it must be checked
            // before comments are skipped.
            if ($line === '// ===END ICANN DOMAINS===') {
                $inIcannSection = false;
            }

            // Skip comments and empty lines
            if ($line === '' || str_starts_with($line, '//')) {
                continue;
            }

            // Remove wildcards and exceptions, then convert to uppercase
            // (multibyte-aware, for international characters).
            $suffix = mb_strtoupper(str_replace(['*.', '!'], '', $line), 'UTF-8');

            // First alternative: a bare TLD (group 1).
            // Second alternative: "prefix.TLD" (groups 2 and 3).
            if (!preg_match('/^([^.]+)$|^(.+)\.([^.]+)$/', $suffix, $matches)) {
                continue;
            }

            // Group 3 is only present when the rule has a prefix.
            $hasPrefix = isset($matches[3]);
            $tld = $hasPrefix ? $matches[3] : $matches[1];
            $suffixes[$tld] ??= [];

            // Only add ICANN domains; here $suffix is exactly "prefix.TLD".
            if ($hasPrefix && $inIcannSection) {
                $suffixes[$tld][] = $suffix;
            }
        }

        return $suffixes;
    }

    /**
     * Recursively deletes a directory and its contents on a best-effort basis.
     *
     * Failures are deliberately silenced, and entries not matched by the "/*"
     * glob pattern are left alone, so the directory may still exist afterwards;
     * callers that care must check with is_dir().
     */
    private function removeDirectory(string $directory): void
    {
        if (!is_dir($directory)) {
            return;
        }

        $entries = glob($directory . '/*');
        if ($entries === false) {
            return;
        }

        foreach ($entries as $entry) {
            if (is_dir($entry)) {
                $this->removeDirectory($entry);
                continue;
            }

            @unlink($entry);
        }

        @rmdir($directory);
    }
}