|
- <?php
-
- declare(strict_types=1);
-
- namespace App\Services;
-
- use RuntimeException;
-
- class GoogleSheetImportService
- {
- /**
- * @return array{id: string, sheets: list<array{gid: string, title: string}>}
- */
- public function sheets(string $url): array
- {
- $spreadsheetId = $this->spreadsheetId($url);
- $currentGid = $this->gidFromUrl($url);
- $sheets = [];
-
- // Strategy 1: v3 worksheets JSON feed — works for publicly published sheets.
- // Each entry's alternate link href contains the numeric gid.
- if ($sheets === []) {
- try {
- $feed = $this->fetch(
- 'https://spreadsheets.google.com/feeds/worksheets/'
- . rawurlencode($spreadsheetId) . '/public/basic?alt=json'
- );
- $sheets = $this->extractSheetsFromFeed($feed);
- } catch (\Throwable) {}
- }
-
- // Strategy 2: htmlview URL — serves rendered HTML with tab links for
- // sheets shared "anyone with the link can view".
- if ($sheets === []) {
- try {
- $html = $this->fetch(
- 'https://docs.google.com/spreadsheets/d/'
- . rawurlencode($spreadsheetId) . '/htmlview'
- );
- $sheets = $this->extractSheets($html);
- } catch (\Throwable) {}
- }
-
- // Strategy 3: edit URL JS-bootstrapped data — last resort.
- if ($sheets === []) {
- try {
- $html = $this->fetch(
- 'https://docs.google.com/spreadsheets/d/'
- . rawurlencode($spreadsheetId) . '/edit?usp=sharing'
- );
- $sheets = $this->extractSheets($html);
- } catch (\Throwable) {}
- }
-
- // Fallback: if we know the gid from the URL, return a labelled placeholder.
- if ($sheets === [] && $currentGid !== null) {
- $sheets[] = ['gid' => $currentGid, 'title' => 'Sheet ' . $currentGid];
- }
-
- if ($sheets === []) {
- $sheets[] = ['gid' => '0', 'title' => 'First sheet'];
- }
-
- return ['id' => $spreadsheetId, 'sheets' => $sheets];
- }
-
- /**
- * @return array{headers: list<string>, rows: list<array<string, string>>}
- */
- public function rows(string $url, string $gid): array
- {
- $spreadsheetId = $this->spreadsheetId($url);
- $csv = $this->fetch(sprintf(
- 'https://docs.google.com/spreadsheets/d/%s/export?format=csv&gid=%s',
- rawurlencode($spreadsheetId),
- rawurlencode($gid)
- ));
-
- return $this->parseCsv($csv);
- }
-
- public function spreadsheetId(string $url): string
- {
- $parts = parse_url($url);
- $host = strtolower((string) ($parts['host'] ?? ''));
-
- if (!in_array($host, ['docs.google.com', 'spreadsheets.google.com'], true)) {
- throw new RuntimeException('Enter a valid Google Sheets URL.');
- }
-
- $path = (string) ($parts['path'] ?? '');
- if (preg_match('#/spreadsheets/d/([a-zA-Z0-9_-]+)#', $path, $matches) !== 1) {
- throw new RuntimeException('The Google Sheets URL does not include a spreadsheet id.');
- }
-
- return $matches[1];
- }
-
- // ── Sheet extraction ──────────────────────────────────────────────────────
-
- /**
- * Parse the v3 JSON feed response.
- *
- * @return list<array{gid: string, title: string}>
- */
- private function extractSheetsFromFeed(string $json): array
- {
- $data = json_decode($json, true);
-
- if (!is_array($data) || !isset($data['feed']['entry'])) {
- return [];
- }
-
- $sheets = [];
-
- foreach ((array) $data['feed']['entry'] as $entry) {
- $title = (string) ($entry['title']['$t'] ?? '');
-
- if ($title === '') {
- continue;
- }
-
- // GID is embedded in the rel="alternate" link href as #gid=NNN or &gid=NNN
- $gid = null;
- foreach ((array) ($entry['link'] ?? []) as $link) {
- if (preg_match('/[#&]gid=(\d+)/', (string) ($link['href'] ?? ''), $m)) {
- $gid = $m[1];
- break;
- }
- }
-
- if ($gid !== null && $this->looksLikeSheet($gid, $title) && !isset($sheets[$gid])) {
- $sheets[$gid] = ['gid' => $gid, 'title' => $title];
- }
- }
-
- return array_values($sheets);
- }
-
- /**
- * Parse HTML from htmlview or edit URL for sheet tab data.
- *
- * @return list<array{gid: string, title: string}>
- */
- private function extractSheets(string $html): array
- {
- $sheets = [];
-
- // ── HTML tab patterns (htmlview format) ───────────────────────────────
- // Google renders tab links like:
- // <a href="#gid=123">Sheet Name</a>
- // <span data-id="123">Sheet Name</span>
- $htmlPatterns = [
- '/<[^>]+href=["\'][^"\']*[#&]gid=(\d+)["\'][^>]*>\s*(?:<[^>]+>\s*)*([^<]{1,100}?)\s*(?:<|$)/i',
- '/data-id=["\'](\d+)["\'][^>]*>\s*([^<]{1,100}?)\s*</i',
- ];
-
- foreach ($htmlPatterns as $pattern) {
- if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER) > 0) {
- foreach ($matches as $match) {
- $gid = $match[1];
- $title = trim(html_entity_decode($match[2], ENT_QUOTES | ENT_HTML5, 'UTF-8'));
- if ($this->looksLikeSheet($gid, $title) && !isset($sheets[$gid])) {
- $sheets[$gid] = ['gid' => $gid, 'title' => $title];
- }
- }
- }
- }
-
- if (!empty($sheets)) {
- return array_values($sheets);
- }
-
- // ── JavaScript JSON patterns (edit URL bootstrapped data) ─────────────
- // Distance increased to 600 chars to handle larger embedded JSON objects.
- $jsPatterns = [
- '/"gid"\s*:\s*(\d+).{0,600}?"name"\s*:\s*"((?:\\\\.|[^"\\\\])+)"/s',
- '/"name"\s*:\s*"((?:\\\\.|[^"\\\\])+)".{0,600}?"gid"\s*:\s*(\d+)/s',
- '/"gid"\s*:\s*(\d+).{0,600}?"title"\s*:\s*"((?:\\\\.|[^"\\\\])+)"/s',
- '/"title"\s*:\s*"((?:\\\\.|[^"\\\\])+)".{0,600}?"gid"\s*:\s*(\d+)/s',
- '/\[\s*(\d+)\s*,\s*"((?:\\\\.|[^"\\\\])+)"/s',
- ];
-
- foreach ($jsPatterns as $pattern) {
- if (preg_match_all($pattern, $html, $matches, PREG_SET_ORDER) > 0) {
- foreach ($matches as $match) {
- $first = (string) $match[1];
- $second = (string) $match[2];
- $gid = ctype_digit($first) ? $first : $second;
- $title = ctype_digit($first) ? $second : $first;
- $title = $this->decodeJsString($title);
-
- if (!$this->looksLikeSheet($gid, $title) || isset($sheets[$gid])) {
- continue;
- }
-
- $sheets[$gid] = ['gid' => $gid, 'title' => $title];
- }
-
- if (!empty($sheets)) {
- break;
- }
- }
- }
-
- return array_values($sheets);
- }
-
- private function looksLikeSheet(string $gid, string $title): bool
- {
- if ($gid === '' || !ctype_digit($gid) || $title === '' || strlen($title) > 120) {
- return false;
- }
-
- return !str_contains($title, '<')
- && !str_contains($title, '{')
- && !str_contains(strtolower($title), 'http');
- }
-
- private function decodeJsString(string $value): string
- {
- $decoded = json_decode('"' . str_replace('"', '\\"', $value) . '"', true);
-
- return is_string($decoded) ? $decoded : stripcslashes($value);
- }
-
- private function gidFromUrl(string $url): ?string
- {
- $fragment = parse_url($url, PHP_URL_FRAGMENT);
- $query = parse_url($url, PHP_URL_QUERY);
-
- foreach ([(string) $fragment, (string) $query] as $part) {
- if (preg_match('/(?:^|&)gid=(\d+)/', $part, $matches) === 1) {
- return $matches[1];
- }
- }
-
- return null;
- }
-
- // ── HTTP fetch ────────────────────────────────────────────────────────────
-
- private function fetch(string $url): string
- {
- $context = stream_context_create([
- 'http' => [
- 'method' => 'GET',
- 'timeout' => 15,
- 'ignore_errors' => true,
- 'follow_location' => true,
- 'max_redirects' => 5,
- 'header' => implode("\r\n", [
- 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
- 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language: en-US,en;q=0.5',
- ]) . "\r\n",
- ],
- ]);
-
- $content = @file_get_contents($url, false, $context);
-
- if (!is_string($content) || trim($content) === '') {
- throw new RuntimeException(
- 'Could not reach the Google Sheet. Check the URL and make sure the sheet is shared as "Anyone with the link can view".'
- );
- }
-
- // Google returns a sign-in page when the sheet requires authentication.
- // Detect this by looking for the login shell markers present in every
- // Google auth redirect (~9 KB of CSS/JS with no actual sheet data).
- if (
- str_contains($content, '.login,.request-storage-access') ||
- str_contains($content, 'ServiceLogin') ||
- str_contains($content, 'accounts.google.com/ServiceLogin')
- ) {
- throw new RuntimeException(
- 'Google returned a sign-in page. The spreadsheet must be shared as "Anyone with the link can view": '
- . 'open the sheet → File → Share → Change to "Anyone with the link" → Viewer.'
- );
- }
-
- return $content;
- }
-
- // ── CSV parsing ───────────────────────────────────────────────────────────
-
- /**
- * @return array{headers: list<string>, rows: list<array<string, string>>}
- */
- private function parseCsv(string $csv): array
- {
- $handle = fopen('php://temp', 'r+');
- if ($handle === false) {
- throw new RuntimeException('Unable to parse sheet data.');
- }
-
- fwrite($handle, $csv);
- rewind($handle);
-
- $headers = fgetcsv($handle);
- if ($headers === false) {
- fclose($handle);
- throw new RuntimeException('The selected sheet is empty.');
- }
-
- $headers = array_map(
- static fn($h): string => trim((string) $h, " \t\n\r\0\x0B\xEF\xBB\xBF"),
- $headers
- );
-
- $rows = [];
- while (($values = fgetcsv($handle)) !== false) {
- $row = [];
- $hasValue = false;
-
- foreach ($headers as $index => $header) {
- if ($header === '') {
- continue;
- }
-
- $value = trim((string) ($values[$index] ?? ''));
- $row[$header] = $value;
- $hasValue = $hasValue || $value !== '';
- }
-
- if ($hasValue) {
- $rows[] = $row;
- }
- }
-
- fclose($handle);
-
- return ['headers' => $headers, 'rows' => $rows];
- }
- }
|