From a50c8c626914ee362bdd4de13a768f769f990812 Mon Sep 17 00:00:00 2001 From: Brady Wetherington Date: Wed, 6 Nov 2024 14:30:46 +0000 Subject: [PATCH 1/3] Automatically detect character encoding of CSV files when processsing them to handle non-UTF-8 file types. Added a new test case and enhanced the test rigs to be able to write non-UTF-8 files. Final cleanup --- app/Importer/Importer.php | 18 +++++ composer.json | 2 + composer.lock | 67 ++++++++++++++++++- .../Importing/Api/ImportAssetsTest.php | 26 +++++++ tests/Support/Importing/FileBuilder.php | 10 ++- 5 files changed, 120 insertions(+), 3 deletions(-) diff --git a/app/Importer/Importer.php b/app/Importer/Importer.php index 907c8b72c..48cdc7266 100644 --- a/app/Importer/Importer.php +++ b/app/Importer/Importer.php @@ -13,6 +13,7 @@ use Illuminate\Support\Facades\Auth; use Illuminate\Support\Facades\DB; use League\Csv\Reader; use Illuminate\Support\Facades\Log; +use Onnov\DetectEncoding\EncodingDetector; abstract class Importer { @@ -124,11 +125,28 @@ abstract class Importer if (! ini_get('auto_detect_line_endings')) { ini_set('auto_detect_line_endings', '1'); } + $detector = new EncodingDetector(); + // By default the importer passes a url to the file. // However, for testing we also support passing a string directly if (is_file($file)) { + $file_contents = file_get_contents($file); // TODO - this loads up the file in memory! Which could be 'big' and thus, this could be 'bad' + } else { + $file_contents = $file; + } + $encoding = $detector->getEncoding($file_contents); + \Log::debug("DETECTED ENCODING IS: $encoding"); + $file_contents = null; //try to save some memory? + if (is_file($file)) { + if ($encoding && strcasecmp($encoding, 'UTF-8') != 0) { + $file = "php://filter/convert.iconv.$encoding.utf-8/resource=".$file; + } $this->csv = Reader::createFromPath($file); } else { + //we already have the string, so do the conversion directly here? + if ($encoding && strcasecmp($encoding, 'UTF-8') != 0) { + $file = iconv($encoding, 'UTF-8', $file); + } $this->csv = Reader::createFromString($file); } $this->tempPassword = substr(str_shuffle('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'), 0, 40); diff --git a/composer.json b/composer.json index 865878280..3040dc424 100644 --- a/composer.json +++ b/composer.json @@ -20,6 +20,7 @@ "php": "^8.1", "ext-curl": "*", "ext-fileinfo": "*", + "ext-iconv": "*", "ext-json": "*", "ext-mbstring": "*", "ext-pdo": "*", @@ -55,6 +56,7 @@ "nunomaduro/collision": "^7.0", "okvpn/clock-lts": "^1.0", "onelogin/php-saml": "^3.4", + "onnov/detect-encoding": "^2.0", "osa-eg/laravel-teams-notification": "^2.1", "paragonie/constant_time_encoding": "^2.3", "paragonie/sodium_compat": "^1.19", diff --git a/composer.lock b/composer.lock index 17fe70c40..b8e67655d 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "0750e3a427347b2a56a05a8b9b533d48", + "content-hash": "2a6e7f5e039ee2f40605aefc5c5baf08", "packages": [ { "name": "alek13/slack", @@ -5574,6 +5574,70 @@ ], "time": "2024-05-30T15:14:26+00:00" }, + { + "name": "onnov/detect-encoding", + "version": "v2.0.0", + "source": { + "type": "git", + "url": "https://github.com/onnov/detect-encoding.git", + "reference": "6a8159ac3e6178ae043244b9d66a9b2701121e07" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/onnov/detect-encoding/zipball/6a8159ac3e6178ae043244b9d66a9b2701121e07", + "reference": "6a8159ac3e6178ae043244b9d66a9b2701121e07", + "shasum": "" + }, + "require": { + "ext-iconv": "*", + "php": ">=7.3" + }, + "require-dev": { + "infection/infection": "*", + "phpbench/phpbench": "*", + "phpcompatibility/php-compatibility": "*", + "phpmd/phpmd": "*", + "phpstan/phpstan": "*", + "phpstan/phpstan-strict-rules": "*", + "phpunit/phpunit": "*", + "roave/backward-compatibility-check": "*", + "squizlabs/php_codesniffer": "*" + }, + "type": "library", + "autoload": { + "psr-4": { + "Onnov\\DetectEncoding\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "onnov", + "email": "oblnn@yandex.ru" + } + ], + "description": "Text encoding definition class instead of mb_detect_encoding. Defines: utf-8, windows-1251, koi8-r, iso-8859-5, ibm866, .....", + "homepage": "https://github.com/onnov/detect-encoding", + "keywords": [ + "cyrillic", + "encoding", + "ibm866", + "iconv", + "iso-8859-5", + "koi8-r", + "mb_detect_encoding", + "utf-8", + "windows-1251" + ], + "support": { + "issues": "https://github.com/onnov/detect-encoding/issues", + "source": "https://github.com/onnov/detect-encoding/tree/v2.0.0" + }, + "time": "2021-01-04T14:29:34+00:00" + }, { "name": "osa-eg/laravel-teams-notification", "version": "v2.1.2", @@ -16570,6 +16634,7 @@ "php": "^8.1", "ext-curl": "*", "ext-fileinfo": "*", + "ext-iconv": "*", "ext-json": "*", "ext-mbstring": "*", "ext-pdo": "*" diff --git a/tests/Feature/Importing/Api/ImportAssetsTest.php b/tests/Feature/Importing/Api/ImportAssetsTest.php index 0f54b22e9..fdea4ca47 100644 --- a/tests/Feature/Importing/Api/ImportAssetsTest.php +++ b/tests/Feature/Importing/Api/ImportAssetsTest.php @@ -141,6 +141,32 @@ class ImportAssetsTest extends ImportDataTestCase implements TestsPermissionsReq } + #[Test] + public function importInternationalAsset(): void + { + $evil_string = 'blähÅÄÖ'; //'це; //first one is cyrllic? so is second. + $evil_string = 'це'; //cyrliccic - windows-1251 (ONE) + //copypasta the thing? well, the important bits? + $importFileBuilder = ImportFileBuilder::new(['itemName' => $evil_string]); //not 'name' + $row = $importFileBuilder->firstRow(); + $import = Import::factory()->asset()->create(['file_path' => $importFileBuilder->saveToImportsDirectory(null, 'WINDOWS-1251')]); + + $this->actingAsForApi(User::factory()->superuser()->create()); + $this->importFileResponse(['import' => $import->id]) + ->assertOk() + ->assertExactJson([ + 'payload' => null, + 'status' => 'success', + 'messages' => ['redirect_url' => route('hardware.index')] + ]); + + $newAsset = Asset::query() + ->with(['location', 'supplier', 'company', 'assignedAssets', 'defaultLoc', 'assetStatus', 'model.category', 'model.manufacturer']) + ->where('serial', $row['serialNumber']) + ->sole(); + + $this->assertEquals($evil_string, $newAsset->name); + } #[Test] public function willIgnoreUnknownColumnsWhenFileContainsUnknownColumns(): void { diff --git a/tests/Support/Importing/FileBuilder.php b/tests/Support/Importing/FileBuilder.php index fad40054b..bf08dc96d 100644 --- a/tests/Support/Importing/FileBuilder.php +++ b/tests/Support/Importing/FileBuilder.php @@ -206,7 +206,7 @@ abstract class FileBuilder * * @return string The filename. */ - public function saveToImportsDirectory(?string $filename = null): string + public function saveToImportsDirectory(?string $filename = null, ?string $locale = null): string { $filename ??= Str::random(40) . '.csv'; @@ -214,9 +214,15 @@ abstract class FileBuilder $stream = fopen(config('app.private_uploads') . "/imports/{$filename}", 'w'); foreach ($this->toCsv() as $row) { + if ($locale) { + $newrow = []; + foreach ($row as $index => $cell) { + $newrow[$index] = iconv('utf-8', $locale, (string) $cell); + } + $row = $newrow; + } fputcsv($stream, $row); } - return $filename; } finally { if (is_resource($stream)) { From d65206c7fed5e54bb6dd9e1a5834abcbb3163619 Mon Sep 17 00:00:00 2001 From: Brady Wetherington Date: Mon, 20 Jan 2025 13:49:42 +0000 Subject: [PATCH 2/3] Move tests to UI-side --- app/Http/Controllers/Api/ImportController.php | 32 +++++++++++++++++- app/Importer/Importer.php | 18 ---------- .../Importing/Api/ImportAssetsTest.php | 26 --------------- tests/Feature/Importing/Ui/ImportTest.php | 33 +++++++++++++++++++ 4 files changed, 64 insertions(+), 45 deletions(-) diff --git a/app/Http/Controllers/Api/ImportController.php b/app/Http/Controllers/Api/ImportController.php index ebf8b550b..b2954d38b 100644 --- a/app/Http/Controllers/Api/ImportController.php +++ b/app/Http/Controllers/Api/ImportController.php @@ -9,12 +9,14 @@ use App\Http\Transformers\ImportsTransformer; use App\Models\Asset; use App\Models\Company; use App\Models\Import; +use Illuminate\Http\UploadedFile; use Illuminate\Support\Facades\Artisan; use Illuminate\Database\Eloquent\JsonEncodingException; use Illuminate\Support\Facades\Request; use Illuminate\Support\Facades\Session; use Illuminate\Support\Facades\Storage; use League\Csv\Reader; +use Onnov\DetectEncoding\EncodingDetector; use Symfony\Component\HttpFoundation\File\Exception\FileException; use Illuminate\Support\Facades\Log; use Illuminate\Http\JsonResponse; @@ -45,6 +47,9 @@ class ImportController extends Controller $path = config('app.private_uploads').'/imports'; $results = []; $import = new Import; + $detector = new EncodingDetector(); + + \Log::error("Okay, do we have files? ".count($files)); foreach ($files as $file) { if (! in_array($file->getMimeType(), [ 'application/vnd.ms-excel', @@ -55,7 +60,7 @@ class ImportController extends Controller 'text/comma-separated-values', 'text/tsv', ])) { $results['error'] = 'File type must be CSV. Uploaded file is '.$file->getMimeType(); - + \Log::error("Bad mime type"); return response()->json(Helper::formatStandardApiResponse('error', null, $results['error']), 422); } @@ -63,11 +68,34 @@ class ImportController extends Controller if (! ini_get('auto_detect_line_endings')) { ini_set('auto_detect_line_endings', '1'); } + $file_contents = $file->getContent(); //TODO - this *does* load the whole file in RAM, but we need that to be able to 'iconv' it? + $encoding = $detector->getEncoding($file_contents); + $reader = null; + if (strcasecmp($encoding, 'UTF-8') != 0) { + \Log::error("Weird encoding detected: $encoding"); + $transliterated = iconv($encoding, 'UTF-8', $file_contents); + if ($transliterated !== false) { + \Log::error("Transliteration was successful."); + $tmpname = tempnam(sys_get_temp_dir(), ''); + $tmpresults = file_put_contents($tmpname, $transliterated); + if ($tmpresults !== false) { + \Log::error("Temporary file written to $tmpname"); + $transliterated = null; //save on memory? + $newfile = new UploadedFile($tmpname, $file->getClientOriginalName(), null, null, true); //WARNING: this is enabling 'test mode' - which is gross, but otherwise the file won't be treated as 'uploaded' + if ($newfile->isValid()) { + \Log::error("new UploadedFile was created"); + $file = $newfile; + } + } + } + } $reader = Reader::createFromFileObject($file->openFile('r')); //file pointer leak? + $file_contents = null; //try to save on memory, I guess? try { $import->header_row = $reader->fetchOne(0); } catch (JsonEncodingException $e) { + \Log::error("cant load header row?"); return response()->json( Helper::formatStandardApiResponse( 'error', @@ -94,6 +122,7 @@ class ImportController extends Controller } } if (count($duplicate_headers) > 0) { + \Log::error("Duplicate headers?"); return response()->json(Helper::formatStandardApiResponse('error', null, implode('; ', $duplicate_headers)),422); } @@ -101,6 +130,7 @@ class ImportController extends Controller // Grab the first row to display via ajax as the user picks fields $import->first_row = $reader->fetchOne(1); } catch (JsonEncodingException $e) { + \Log::error("JSON eocding reception?"); return response()->json( Helper::formatStandardApiResponse( 'error', diff --git a/app/Importer/Importer.php b/app/Importer/Importer.php index 48cdc7266..907c8b72c 100644 --- a/app/Importer/Importer.php +++ b/app/Importer/Importer.php @@ -13,7 +13,6 @@ use Illuminate\Support\Facades\Auth; use Illuminate\Support\Facades\DB; use League\Csv\Reader; use Illuminate\Support\Facades\Log; -use Onnov\DetectEncoding\EncodingDetector; abstract class Importer { @@ -125,28 +124,11 @@ abstract class Importer if (! ini_get('auto_detect_line_endings')) { ini_set('auto_detect_line_endings', '1'); } - $detector = new EncodingDetector(); - // By default the importer passes a url to the file. // However, for testing we also support passing a string directly if (is_file($file)) { - $file_contents = file_get_contents($file); // TODO - this loads up the file in memory! Which could be 'big' and thus, this could be 'bad' - } else { - $file_contents = $file; - } - $encoding = $detector->getEncoding($file_contents); - \Log::debug("DETECTED ENCODING IS: $encoding"); - $file_contents = null; //try to save some memory? - if (is_file($file)) { - if ($encoding && strcasecmp($encoding, 'UTF-8') != 0) { - $file = "php://filter/convert.iconv.$encoding.utf-8/resource=".$file; - } $this->csv = Reader::createFromPath($file); } else { - //we already have the string, so do the conversion directly here? - if ($encoding && strcasecmp($encoding, 'UTF-8') != 0) { - $file = iconv($encoding, 'UTF-8', $file); - } $this->csv = Reader::createFromString($file); } $this->tempPassword = substr(str_shuffle('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'), 0, 40); diff --git a/tests/Feature/Importing/Api/ImportAssetsTest.php b/tests/Feature/Importing/Api/ImportAssetsTest.php index fdea4ca47..0f54b22e9 100644 --- a/tests/Feature/Importing/Api/ImportAssetsTest.php +++ b/tests/Feature/Importing/Api/ImportAssetsTest.php @@ -141,32 +141,6 @@ class ImportAssetsTest extends ImportDataTestCase implements TestsPermissionsReq } - #[Test] - public function importInternationalAsset(): void - { - $evil_string = 'blähÅÄÖ'; //'це; //first one is cyrllic? so is second. - $evil_string = 'це'; //cyrliccic - windows-1251 (ONE) - //copypasta the thing? well, the important bits? - $importFileBuilder = ImportFileBuilder::new(['itemName' => $evil_string]); //not 'name' - $row = $importFileBuilder->firstRow(); - $import = Import::factory()->asset()->create(['file_path' => $importFileBuilder->saveToImportsDirectory(null, 'WINDOWS-1251')]); - - $this->actingAsForApi(User::factory()->superuser()->create()); - $this->importFileResponse(['import' => $import->id]) - ->assertOk() - ->assertExactJson([ - 'payload' => null, - 'status' => 'success', - 'messages' => ['redirect_url' => route('hardware.index')] - ]); - - $newAsset = Asset::query() - ->with(['location', 'supplier', 'company', 'assignedAssets', 'defaultLoc', 'assetStatus', 'model.category', 'model.manufacturer']) - ->where('serial', $row['serialNumber']) - ->sole(); - - $this->assertEquals($evil_string, $newAsset->name); - } #[Test] public function willIgnoreUnknownColumnsWhenFileContainsUnknownColumns(): void { diff --git a/tests/Feature/Importing/Ui/ImportTest.php b/tests/Feature/Importing/Ui/ImportTest.php index 3493f47af..4b811c487 100644 --- a/tests/Feature/Importing/Ui/ImportTest.php +++ b/tests/Feature/Importing/Ui/ImportTest.php @@ -3,6 +3,8 @@ namespace Tests\Feature\Importing\Ui; use App\Models\User; +use Illuminate\Http\UploadedFile; +use PHPUnit\Framework\Attributes\Test; use Tests\TestCase; class ImportTest extends TestCase @@ -13,4 +15,35 @@ class ImportTest extends TestCase ->get(route('imports.index')) ->assertOk(); } + + public function testStoreInternationalAsset(): void + { + $evil_string = 'це'; //cyrillic - windows-1251 (ONE) + $csv_contents = "a,b,c\n$evil_string,$evil_string,$evil_string\n"; + + // now, deliberately transform our UTF-8 into Windows-1251 so we can test out the character-set detection + $transliterated_contents = iconv('UTF-8', 'WINDOWS-1251', $csv_contents); + //\Log::error("RAW TRANSLITERATED CONTENTS: $transliterated_contents"); // should show 'unicode missing glyph' symbol in logs. + + $this->actingAsForApi(User::factory()->superuser()->create()); + $results = $this->post(route('api.imports.store'), ['files' => [UploadedFile::fake()->createWithContent("myname.csv", $transliterated_contents)]]) + ->assertOk() + ->assertJsonStructure([ + "files" => [ + [ + "created_at", + "field_map", + "file_path", + "filesize", + "first_row", + "header_row", + "id", + "import_type", + "name", + ] + ] + ]); + \Log::error(print_r($results, true)); + $this->assertEquals($evil_string, $results->json()['files'][0]['first_row'][0]); + } } From bbb2af7f5349c6071cd94c82e8db074db3dc4805 Mon Sep 17 00:00:00 2001 From: Brady Wetherington Date: Mon, 20 Jan 2025 13:52:26 +0000 Subject: [PATCH 3/3] remove log messages --- app/Http/Controllers/Api/ImportController.php | 9 --------- 1 file changed, 9 deletions(-) diff --git a/app/Http/Controllers/Api/ImportController.php b/app/Http/Controllers/Api/ImportController.php index b2954d38b..2a4d91c3d 100644 --- a/app/Http/Controllers/Api/ImportController.php +++ b/app/Http/Controllers/Api/ImportController.php @@ -49,7 +49,6 @@ class ImportController extends Controller $import = new Import; $detector = new EncodingDetector(); - \Log::error("Okay, do we have files? ".count($files)); foreach ($files as $file) { if (! in_array($file->getMimeType(), [ 'application/vnd.ms-excel', @@ -60,7 +59,6 @@ class ImportController extends Controller 'text/comma-separated-values', 'text/tsv', ])) { $results['error'] = 'File type must be CSV. Uploaded file is '.$file->getMimeType(); - \Log::error("Bad mime type"); return response()->json(Helper::formatStandardApiResponse('error', null, $results['error']), 422); } @@ -72,18 +70,14 @@ class ImportController extends Controller $encoding = $detector->getEncoding($file_contents); $reader = null; if (strcasecmp($encoding, 'UTF-8') != 0) { - \Log::error("Weird encoding detected: $encoding"); $transliterated = iconv($encoding, 'UTF-8', $file_contents); if ($transliterated !== false) { - \Log::error("Transliteration was successful."); $tmpname = tempnam(sys_get_temp_dir(), ''); $tmpresults = file_put_contents($tmpname, $transliterated); if ($tmpresults !== false) { - \Log::error("Temporary file written to $tmpname"); $transliterated = null; //save on memory? $newfile = new UploadedFile($tmpname, $file->getClientOriginalName(), null, null, true); //WARNING: this is enabling 'test mode' - which is gross, but otherwise the file won't be treated as 'uploaded' if ($newfile->isValid()) { - \Log::error("new UploadedFile was created"); $file = $newfile; } } @@ -95,7 +89,6 @@ class ImportController extends Controller try { $import->header_row = $reader->fetchOne(0); } catch (JsonEncodingException $e) { - \Log::error("cant load header row?"); return response()->json( Helper::formatStandardApiResponse( 'error', @@ -122,7 +115,6 @@ class ImportController extends Controller } } if (count($duplicate_headers) > 0) { - \Log::error("Duplicate headers?"); return response()->json(Helper::formatStandardApiResponse('error', null, implode('; ', $duplicate_headers)),422); } @@ -130,7 +122,6 @@ class ImportController extends Controller // Grab the first row to display via ajax as the user picks fields $import->first_row = $reader->fetchOne(1); } catch (JsonEncodingException $e) { - \Log::error("JSON eocding reception?"); return response()->json( Helper::formatStandardApiResponse( 'error',