/**
 * Discovers airline booking URLs using Playwright.
 *
 * Strategies, in priority order:
 *   1. Scan the page HTML for links containing IATA codes → extract a URL template
 *   2. Intercept pushState/replaceState while interacting with the search form
 *   3. Capture network requests that carry the search parameters
 *   4. Fallback: store the booking page URL without a template
 *
 * Run:      npx tsx scripts/discover-booking-urls.ts --offset 0 --limit 100
 * Test:     npx tsx scripts/discover-booking-urls.ts --iata KL,BA,FR
 * 2nd pass: npx tsx scripts/discover-booking-urls.ts --retry-failed --offset 0 --limit 1020
 */

import { chromium, type Browser, type Page } from 'playwright'
import { createClient } from '@supabase/supabase-js'
import { parseArgs } from 'node:util'

// --- Config ---

const SUPABASE_URL = process.env.SUPABASE_URL || 'http://localhost:8000'
const SUPABASE_KEY = process.env.SUPABASE_SERVICE_ROLE_KEY || ''
const SITE_TIMEOUT = 20_000
const NAV_TIMEOUT = 15_000

// Well-known IATA airport codes used to detect search URL patterns in links.
// If any of these appear in an href, the link likely reveals the search URL format.
const KNOWN_IATA_CODES = new Set([
  'AMS', 'LHR', 'CDG', 'FRA', 'MAD', 'BCN', 'FCO', 'MXP', 'IST', 'ATH',
  'JFK', 'LAX', 'MIA', 'SFO', 'ORD', 'BOS', 'ATL', 'DFW', 'SEA', 'DEN',
  'NRT', 'HND', 'ICN', 'PEK', 'PVG', 'HKG', 'SIN', 'BKK', 'DEL', 'BOM',
  'DXB', 'DOH', 'CAI', 'JNB', 'NBO', 'ADD', 'CMN', 'ALG', 'LOS', 'ACC',
  'GRU', 'EZE', 'BOG', 'LIM', 'SCL', 'MEX', 'CUN', 'PTY', 'SJO', 'HAV',
  'SYD', 'MEL', 'AKL', 'NYC', 'LON', 'PAR', 'TYO', 'ROM', 'MIL',
])

// Booking-related href keywords
const BOOKING_HREF_PATTERNS = [
  'book', 'booking', 'flight', 'search', 'reserv', 'vuelo', 'fly',
  'ticket', 'trip', 'travel', 'buy', 'fare', 'offer'
]

const BOOKING_HREF_SELECTOR = BOOKING_HREF_PATTERNS
  .map(p => `a[href*="${p}" i]`)
  .join(', ')

// Selectors for search form inputs
const ORIGIN_SELECTORS = [
  'input[name*="origin" i]',
  'input[name*="from" i]',
  'input[name*="departure" i]',
  'input[name*="depart" i]',
  'input[name*="salida" i]',
  'input[name*="origen" i]',
  'input[placeholder*="from" i]',
  'input[placeholder*="origin" i]',
  'input[placeholder*="departure" i]',
  'input[placeholder*="desde" i]',
  'input[placeholder*="origen" i]',
  'input[placeholder*="salida" i]',
  'input[aria-label*="from" i]',
  'input[aria-label*="origin" i]',
  'input[aria-label*="departure" i]',
  'input[aria-label*="desde" i]',
  'input[id*="origin" i]',
  'input[id*="from" i]',
  'input[id*="depart" i]',
]

const DEST_SELECTORS = [
  'input[name*="destination" i]',
  'input[name*="to" i]',
  'input[name*="arrival" i]',
  'input[name*="arriv" i]',
  'input[name*="destino" i]',
  'input[name*="llegada" i]',
  'input[placeholder*="to" i]',
  'input[placeholder*="destination" i]',
  'input[placeholder*="arrival" i]',
  'input[placeholder*="hacia" i]',
  'input[placeholder*="destino" i]',
  'input[placeholder*="llegada" i]',
  'input[aria-label*="to" i]',
  'input[aria-label*="destination" i]',
  'input[aria-label*="arrival" i]',
  'input[aria-label*="destino" i]',
  'input[id*="destination" i]',
  'input[id*="to" i]',
  'input[id*="arriv" i]',
]

const SEARCH_BUTTON_SELECTORS = [
  'button[type="submit"]',
  'button:has-text("Search")',
  'button:has-text("Buscar")',
  'button:has-text("Book")',
  'button:has-text("Find")',
  'button:has-text("Reservar")',
  'button:has-text("Buscar vuelos")',
  'button:has-text("Search flights")',
  'button:has-text("Find flights")',
  'a:has-text("Search")',
  'a:has-text("Buscar")',
  'input[type="submit"]',
]

// Regexes used while scanning links; hoisted so they are not rebuilt per link.
const FLIGHT_HREF_RE = /search|book|flight|reserv|offer|fare|vuelo|select/i
const NON_FLIGHT_HREF_RE = /car[s.]|hotel|guide|insurance|lounge|cargo|club|baggage|checkin|check-in|status|manage/i
const IATA_WORD_RE = /\b([A-Z]{3})\b/g
// A literal "1" that is a whole parameter value (after "=" or ":"), taken to be a passenger count.
const PASSENGER_COUNT_RE = /(?<=[=:])1(?=[&:,\s]|$)/g

// City/airport names that sites may put in URLs instead of the IATA code we typed.
// Only the codes used by the form-filling strategy are listed.
const CITY_ALIASES: Record<string, readonly string[]> = {
  MAD: ['madrid'],
  LHR: ['london', 'londres', 'heathrow'],
}

// --- Supabase ---

const supabase = createClient(SUPABASE_URL, SUPABASE_KEY)

/** Row shape selected from the `airlines` table. */
interface Airline {
  iata: string
  name: string
  website: string
}

/** Outcome of probing one airline website. */
interface DiscoveryResult {
  iata: string
  bookingUrl: string | null
  bookingUrlTemplate: string | null
  method?: string // how the template was discovered
  error?: string
}

/** `window` as seen by the init script, which stores intercepted history URLs on it. */
type CaptureWindow = Window & { __capturedUrls?: string[] }

// --- Small utilities ---

/** Decode a URI component, returning the input unchanged when it is malformed. */
function safeDecode(value: string): string {
  try {
    return decodeURIComponent(value)
  } catch {
    // decodeURIComponent throws URIError on stray "%" sequences; treat as plain text.
    return value
  }
}

/** Escape regex metacharacters so `value` matches literally inside a RegExp. */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

/** Best-effort human-readable message for a caught value of unknown type. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

// --- Template extraction from links ---

/**
 * Scan all links on the page for URLs containing IATA airport codes.
 * These destination links reveal the search URL pattern.
 *
 * @param page    Page whose anchors are inspected.
 * @param baseUrl URL used to resolve relative hrefs.
 * @returns The first matching link as a template with {origin}/{destination}
 *          (and possibly {passengers}) placeholders plus the concrete URL it
 *          was derived from, or null when no link qualifies or the DOM query fails.
 */
async function extractTemplateFromLinks(page: Page, baseUrl: string): Promise<{ template: string; bookingUrl: string } | null> {
  try {
    // Keep this callback free of named inner functions: it is serialized and run in the browser.
    const hrefs = await page.$$eval('a[href]', (els) => els.map(el => el.getAttribute('href') || ''))

    for (const href of hrefs) {
      if (!href || href.length < 10 || href.length > 500) continue

      // Must look like a FLIGHT search/booking URL
      if (!FLIGHT_HREF_RE.test(href)) continue

      // Exclude non-flight links (cars, hotels, guides, insurance, etc.)
      if (NON_FLIGHT_HREF_RE.test(href)) continue

      // Find known IATA airport codes in the URL. A malformed href must only
      // skip decoding for this link, not abort the scan of the remaining links.
      const decoded = safeDecode(href)
      const foundCodes = [...decoded.matchAll(IATA_WORD_RE)]
        .map(m => m[1])
        .filter((c): c is string => c !== undefined && KNOWN_IATA_CODES.has(c))

      // Need at least one real airport code
      const [firstCode, secondCode] = [...new Set(foundCodes)]
      if (firstCode === undefined) continue

      let template = href
      if (secondCode !== undefined) {
        // Two codes: first = origin, second = destination
        template = template.replace(new RegExp(`\\b${firstCode}\\b`), '{origin}')
        template = template.replace(new RegExp(`\\b${secondCode}\\b`), '{destination}')
        // Handle round-trip (origin repeated at end)
        template = template.replace(new RegExp(`\\b${firstCode}\\b`), '{origin}')
      } else {
        // Single code — likely a destination-only link from the homepage
        template = template.replace(new RegExp(`\\b${firstCode}\\b`, 'g'), '{destination}')
      }

      // Replace passenger counts in query params
      template = template.replace(PASSENGER_COUNT_RE, '{passengers}')

      // The code may have been visible only after decoding; then nothing was
      // substituted in the raw href and this link is of no use.
      if (template.includes('{destination}') || template.includes('{origin}')) {
        return {
          template: resolveUrl(baseUrl, template),
          bookingUrl: resolveUrl(baseUrl, href),
        }
      }
    }
  } catch {
    // DOM query failed
  }
  return null
}

// --- Helper functions ---

/** Resolve `href` against `base`; returns `href` unchanged if it cannot be parsed. */
function resolveUrl(base: string, href: string): string {
  try {
    return new URL(href, base).toString()
  } catch {
    return href
  }
}

/**
 * Pick the most booking-like visible link on the page.
 *
 * @returns The raw href of the first visible link whose text or href matches
 *          booking keywords; failing that, the href of the first link matched
 *          by the booking selector (without visibility/exclusion checks);
 *          null when nothing matches or the DOM query throws.
 */
async function findBookingLink(page: Page): Promise<string | null> {
  try {
    const links = await page.$$(BOOKING_HREF_SELECTOR)

    for (const link of links) {
      const href = await link.getAttribute('href')
      const text = (await link.textContent())?.toLowerCase() || ''
      const isVisible = await link.isVisible().catch(() => false)

      if (!href || !isVisible) continue
      if (href.includes('career') || href.includes('about') || href.includes('press') || href.includes('blog')) continue

      if (/book|reserv|search|buscar|vuelo|flight|fly|ticket|buy/i.test(text) || /book|reserv|search|flight/i.test(href)) {
        return href
      }
    }

    // Nothing passed the stricter checks: fall back to the first selector match.
    const [firstLink] = links
    if (firstLink) {
      return await firstLink.getAttribute('href')
    }
  } catch {
    // Best effort: a detached element or closed page simply means "no link found".
  }
  return null
}

/**
 * Type `value` into the first visible input matched by any of `selectors`,
 * then press Enter (e.g. to accept an autocomplete suggestion).
 *
 * @returns true if an input was filled, false if none could be.
 */
async function findAndFillInput(page: Page, selectors: readonly string[], value: string): Promise<boolean> {
  for (const sel of selectors) {
    try {
      const el = await page.$(sel)
      if (el && await el.isVisible().catch(() => false)) {
        await el.click()
        await el.fill(value)
        await page.waitForTimeout(500)
        await el.press('Enter').catch(() => {})
        return true
      }
    } catch {
      // This selector did not work out (not clickable, not fillable, ...); try the next one.
      continue
    }
  }
  return false
}

/**
 * Click the first visible element matched by SEARCH_BUTTON_SELECTORS.
 *
 * @returns true if a button was clicked, false otherwise.
 */
async function clickSearchButton(page: Page): Promise<boolean> {
  for (const sel of SEARCH_BUTTON_SELECTORS) {
    try {
      const btn = await page.$(sel)
      if (btn && await btn.isVisible().catch(() => false)) {
        await btn.click()
        return true
      }
    } catch {
      // Click failed for this candidate; try the next selector.
      continue
    }
  }
  return false
}

/**
 * Turn a concrete search URL into a template by replacing the values that were
 * searched for with {origin}, {destination}, {date} and {passengers}.
 *
 * All literal values are replaced in a single case-insensitive pass, longest
 * first, so that a city name ("madrid") is not partially consumed by its code
 * ("MAD") and already-inserted placeholders are never rewritten.
 *
 * @param url         Concrete URL observed after a search.
 * @param origin      Origin code that was typed into the form.
 * @param destination Destination code that was typed into the form.
 * @param dateIso     Travel date as YYYY-MM-DD.
 * @returns The URL with recognised values replaced by placeholders.
 */
function buildTemplateFromUrl(url: string, origin: string, destination: string, dateIso: string): string {
  const dateCompact = dateIso.replace(/-/g, '')
  const dateDMY = dateIso.split('-').reverse().join('/')

  // Literal (lower-cased) → placeholder. Earlier entries win on duplicates.
  const placeholders = new Map<string, string>()
  const register = (literal: string, placeholder: string): void => {
    const key = literal.toLowerCase()
    if (key && !placeholders.has(key)) placeholders.set(key, placeholder)
  }

  for (const form of [dateIso, dateDMY, dateCompact, encodeURIComponent(dateIso), encodeURIComponent(dateDMY)]) {
    register(form, '{date}')
  }
  for (const alias of CITY_ALIASES[origin.toUpperCase()] ?? []) register(alias, '{origin}')
  for (const alias of CITY_ALIASES[destination.toUpperCase()] ?? []) register(alias, '{destination}')
  register(origin, '{origin}')
  register(destination, '{destination}')

  let template = url
  if (placeholders.size > 0) {
    // Longest literal first so the alternation prefers "madrid" over "mad".
    const alternation = [...placeholders.keys()]
      .sort((a, b) => b.length - a.length)
      .map(escapeRegExp)
      .join('|')
    template = template.replace(
      new RegExp(alternation, 'gi'),
      match => placeholders.get(match.toLowerCase()) ?? match,
    )
  }

  // Passenger counts are only looked for in the query string, never in the path.
  const queryStart = template.indexOf('?')
  if (queryStart !== -1 && queryStart < template.length - 1) {
    const query = template.slice(queryStart + 1).replace(PASSENGER_COUNT_RE, '{passengers}')
    template = `${template.slice(0, queryStart + 1)}${query}`
  }

  return template
}

// --- Main discovery function ---

/**
 * Probe one airline website and try to learn its booking URL and URL template.
 *
 * Never throws: failures are reported through `result.error`. The page opened
 * for the probe is always closed.
 *
 * @param browser Shared browser instance; a fresh page is opened per airline.
 * @param airline Airline row with the website to visit.
 * @returns What was found; `bookingUrlTemplate` stays null when only a booking
 *          page (or nothing) could be identified.
 */
async function discoverAirline(browser: Browser, airline: Airline): Promise<DiscoveryResult> {
  const result: DiscoveryResult = { iata: airline.iata, bookingUrl: null, bookingUrlTemplate: null }
  let page: Page | null = null

  try {
    page = await browser.newPage({
      userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36',
      viewport: { width: 1280, height: 720 },
      locale: 'es-ES',
    })
    page.setDefaultTimeout(SITE_TIMEOUT)

    // Mask webdriver + capture pushState/replaceState.
    // This callback is serialized into the browser: it must be self-contained
    // and should not declare named inner functions.
    await page.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false })

      const captured: string[] = []
      ;(window as CaptureWindow).__capturedUrls = captured
      const origPush = history.pushState.bind(history)
      const origReplace = history.replaceState.bind(history)

      history.pushState = (...args: Parameters<History['pushState']>) => {
        // The url argument may be a string or a URL object; store it as text.
        if (args[2] != null) captured.push(String(args[2]))
        return origPush(...args)
      }
      history.replaceState = (...args: Parameters<History['replaceState']>) => {
        if (args[2] != null) captured.push(String(args[2]))
        return origReplace(...args)
      }
    })

    // Capture search-related network requests
    const searchRequests: string[] = []
    page.on('request', req => {
      const url = req.url()
      const type = req.resourceType()
      if ((type === 'xhr' || type === 'fetch') && /search|book|flight|offer|avail|fare/i.test(url)) {
        searchRequests.push(url)
      }
    })

    // Step 1: Navigate to airline website
    await page.goto(airline.website, { waitUntil: 'domcontentloaded', timeout: SITE_TIMEOUT })
    await page.waitForTimeout(3000)

    const startUrl = page.url()

    // ========================================
    // Strategy 1: Extract template from links
    // ========================================
    const linkTemplate = await extractTemplateFromLinks(page, startUrl)
    if (linkTemplate) {
      result.bookingUrlTemplate = linkTemplate.template
      result.bookingUrl = linkTemplate.bookingUrl
      result.method = 'link-template'
      return result
    }

    // ========================================
    // Strategy 2: Find booking page + try form interaction
    // ========================================
    const bookingHref = await findBookingLink(page)

    if (bookingHref) {
      const bookingUrl = resolveUrl(startUrl, bookingHref)
      result.bookingUrl = bookingUrl

      try {
        await page.goto(bookingUrl, { waitUntil: 'domcontentloaded', timeout: NAV_TIMEOUT })
        await page.waitForTimeout(2000)
      } catch {
        // Navigation failed, but we have the URL
      }

      // Check links on booking page too
      const bookingPageTemplate = await extractTemplateFromLinks(page, page.url())
      if (bookingPageTemplate) {
        result.bookingUrlTemplate = bookingPageTemplate.template
        result.bookingUrl = bookingPageTemplate.bookingUrl
        result.method = 'booking-page-link-template'
        return result
      }
    } else {
      result.bookingUrl = startUrl
    }

    // ========================================
    // Strategy 3: Fill form + capture URL change / pushState
    // ========================================
    const testOrigin = 'MAD'
    const testDest = 'LHR'
    // 30 days from now, as YYYY-MM-DD
    const testDate = (() => {
      const d = new Date()
      d.setDate(d.getDate() + 30)
      return d.toISOString().slice(0, 10)
    })()

    const filledOrigin = await findAndFillInput(page, ORIGIN_SELECTORS, testOrigin)
    const filledDest = await findAndFillInput(page, DEST_SELECTORS, testDest)

    if (!filledOrigin && !filledDest) return result

    const urlBefore = page.url()
    const clicked = await clickSearchButton(page)
    if (!clicked) return result

    // Wait for navigation or pushState
    await page.waitForTimeout(5000)

    // Check 3a: URL changed (traditional navigation)
    const urlAfter = page.url()
    if (urlAfter !== urlBefore) {
      result.bookingUrlTemplate = buildTemplateFromUrl(urlAfter, testOrigin, testDest, testDate)
      result.bookingUrl = urlAfter
      result.method = 'form-url-change'
      return result
    }

    // Check 3b: pushState/replaceState captured
    const captured: string[] = await page.evaluate(() => (window as CaptureWindow).__capturedUrls ?? [])
    const relevantCapture = captured.find(u =>
      typeof u === 'string' && (u.includes(testOrigin) || u.includes(testDest) || /search|book|flight/i.test(u))
    )
    if (relevantCapture) {
      const fullUrl = resolveUrl(page.url(), relevantCapture)
      result.bookingUrlTemplate = buildTemplateFromUrl(fullUrl, testOrigin, testDest, testDate)
      result.bookingUrl = fullUrl
      result.method = 'pushstate'
      return result
    }

    // Check 3c: Network requests with search params
    const relevantRequest = searchRequests.find(u => u.includes(testOrigin) || u.includes(testDest))
    if (relevantRequest) {
      result.bookingUrlTemplate = buildTemplateFromUrl(relevantRequest, testOrigin, testDest, testDate)
      result.bookingUrl = relevantRequest
      result.method = 'network-request'
      return result
    }
  } catch (err: unknown) {
    result.error = describeError(err).slice(0, 200)
  } finally {
    await page?.close().catch(() => {})
  }

  return result
}

// --- Main ---

/**
 * Parse a CLI value that must be a base-10 integer of at least `min`.
 *
 * @throws Error when the value is missing, not a plain integer, or below `min`.
 */
function parseIntArg(raw: string | undefined, flag: string, min: number): number {
  const trimmed = raw?.trim() ?? ''
  const value = /^\d+$/.test(trimmed) ? Number.parseInt(trimmed, 10) : Number.NaN
  if (!Number.isSafeInteger(value) || value < min) {
    throw new Error(`--${flag} must be an integer >= ${min} (got "${raw ?? ''}")`)
  }
  return value
}

/**
 * CLI entry point: select airlines from Supabase, probe each one sequentially,
 * persist what was discovered and print a summary (human-readable plus one
 * JSON line).
 */
async function main(): Promise<void> {
  const { values } = parseArgs({
    options: {
      offset: { type: 'string', default: '0' },
      limit: { type: 'string', default: '100' },
      iata: { type: 'string' },
      'retry-failed': { type: 'boolean', default: false },
    }
  })

  const offset = parseIntArg(values.offset, 'offset', 0)
  const limit = parseIntArg(values.limit, 'limit', 1)
  const iataCodes = values.iata
    ?.split(',')
    .map(s => s.trim().toUpperCase())
    .filter(code => code.length > 0)
  const retryFailed = values['retry-failed']

  if (iataCodes) {
    console.log(`[discover] Starting with specific airlines: ${iataCodes.join(', ')}`)
  } else if (retryFailed) {
    console.log(`[discover] Retrying airlines without template, offset=${offset} limit=${limit}`)
  } else {
    console.log(`[discover] Starting offset=${offset} limit=${limit}`)
  }

  // Fetch airlines from Supabase
  let query = supabase
    .from('airlines')
    .select('iata, name, website')
    .not('website', 'is', null)
    .order('iata')

  if (iataCodes) {
    query = query.in('iata', iataCodes)
  } else {
    if (retryFailed) {
      // Only process airlines that have no template yet
      query = query.is('booking_url_template', null)
    }
    query = query.range(offset, offset + limit - 1)
  }

  const { data: airlines, error } = await query

  if (error) {
    console.error('[discover] Failed to fetch airlines:', error.message)
    process.exit(1)
  }

  if (!airlines?.length) {
    console.log('[discover] No airlines to process')
    process.exit(0)
  }

  console.log(`[discover] Processing ${airlines.length} airlines`)

  const browser = await chromium.launch({
    headless: true,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-http2',
    ]
  })

  let discovered = 0
  let withTemplate = 0
  const failed: string[] = []

  try {
    for (const [i, row] of airlines.entries()) {
      // Rows come from the select above; `website` is non-null thanks to the filter.
      const airline = row as Airline
      const progress = `[${i + 1}/${airlines.length}]`

      try {
        const result = await discoverAirline(browser, airline)

        if (!result.bookingUrl && !result.bookingUrlTemplate) {
          failed.push(airline.iata)
          console.log(`${progress} ${airline.iata} (${airline.name}): SKIP${result.error ? ` (${result.error.slice(0, 80)})` : ''}`)
          continue
        }

        const update: Record<string, string> = { booking_url_discovered_at: new Date().toISOString() }
        if (result.bookingUrl) update.booking_url = result.bookingUrl
        if (result.bookingUrlTemplate) update.booking_url_template = result.bookingUrlTemplate

        // supabase-js reports failures through the returned `error`, it does not throw.
        const { error: updateError } = await supabase
          .from('airlines')
          .update(update)
          .eq('iata', airline.iata)

        if (updateError) {
          // Nothing was persisted, so this airline must not be counted as discovered.
          failed.push(airline.iata)
          console.log(`${progress} ${airline.iata} (${airline.name}): ERROR ${updateError.message.slice(0, 80)}`)
          continue
        }

        discovered++
        if (result.bookingUrlTemplate) withTemplate++

        const methodTag = result.method ? ` [${result.method}]` : ''
        console.log(`${progress} ${airline.iata} (${airline.name}): OK${result.bookingUrlTemplate ? ' +template' : ''}${methodTag} -> ${result.bookingUrl?.slice(0, 120)}`)
      } catch (err: unknown) {
        failed.push(airline.iata)
        console.log(`${progress} ${airline.iata} (${airline.name}): ERROR ${describeError(err).slice(0, 80)}`)
      }
    }
  } finally {
    // Always release the browser process, even if the loop is interrupted.
    await browser.close()
  }

  console.log(`\n[discover] Done: ${discovered} discovered, ${withTemplate} with template, ${failed.length} failed`)
  if (failed.length > 0) console.log(`[discover] Failed: ${failed.join(', ')}`)

  console.log(JSON.stringify({ discovered, withTemplate, failed: failed.length, failedCodes: failed }))
}

main().catch(err => {
  console.error('[discover] Fatal:', err)
  process.exit(1)
})