pdfextractor

package com.yourdomain.pdfextractor

import android.content.ContentValues
import android.content.Context
import android.graphics.pdf.PdfRenderer
import android.os.Build
import android.os.Environment
import android.os.ParcelFileDescriptor
import android.provider.MediaStore
import android.util.Log
import androidx.annotation.RequiresApi
import androidx.lifecycle.ViewModel
import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.CancellationException
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.currentCoroutineContext
import kotlinx.coroutines.ensureActive
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.asStateFlow
import kotlinx.coroutines.flow.update
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.File
import java.io.FileOutputStream
import java.util.zip.InflaterInputStream

// ============================================================================
// 1. STATE & DATA MODELS
// ============================================================================

/** UI-facing state of a text-extraction run. */
sealed interface ExtractionState {
    /** Nothing has been requested yet, or the state was reset. */
    data object Idle : ExtractionState

    /** An extraction is currently running. */
    data object Processing : ExtractionState

    /** The text was extracted and written; [savedFilePath] is a display path. */
    data class Success(val savedFilePath: String, val message: String) : ExtractionState

    /** The extraction or the save failed; [message] describes why. */
    data class Error(val message: String) : ExtractionState
}

// ============================================================================
// 2. CORE BATCH EXTRACTOR & SAVER
// ============================================================================

/**
 * Iterates over every page of a PDF, extracts its text and saves the result as a
 * plain-text file under the device's Documents directory.
 *
 * Only the application context is retained, so an instance may outlive the
 * Activity that created it without pinning that Activity in memory.
 */
class PdfTextExporter(context: Context) {

    private val appContext: Context = context.applicationContext ?: context

    /**
     * Extracts the text of [pdfFile] and writes it to
     * `Documents/MyApp/<subFolder>/<outputFileName>.txt`.
     *
     * Runs on [Dispatchers.IO]. Cancellation is checked before each page and is
     * propagated as a [CancellationException] instead of being wrapped in the result.
     *
     * @param pdfFile the PDF to read.
     * @param outputFileName target file name; `.txt` is appended when missing.
     * @param subFolder folder below the app's root folder inside Documents.
     * @return success with the display path of the saved file, or failure carrying
     *   the exception (no page yielded text, storage write failed, unreadable PDF, ...).
     */
    suspend fun extractAndSaveText(
        pdfFile: File,
        outputFileName: String,
        subFolder: String = "ExtractedText",
    ): Result<String> = withContext(Dispatchers.IO) {
        runCatching {
            val documentText = renderDocumentText(pdfFile)
            val targetFileName =
                if (outputFileName.endsWith(".txt")) outputFileName else "$outputFileName.txt"
            check(saveTextToDevice(documentText, targetFileName, subFolder)) {
                "Failed to write the text file to storage."
            }
            // Must mirror the folder layout used by saveTextToDevice.
            "Documents/$APP_ROOT_FOLDER/$subFolder/$targetFileName"
        }.onFailure { failure ->
            // runCatching catches Throwable; cancellation must never be reported as an error.
            if (failure is CancellationException) throw failure
        }
    }

    /**
     * Builds the full document text: one `--- PAGE n ---` header per page followed
     * by that page's text.
     *
     * @throws IllegalStateException when no page produced any non-blank text.
     */
    private suspend fun renderDocumentText(pdfFile: File): String {
        // The raw bytes are only needed by the fallback parser, so load them on first use.
        val fileBytes by lazy { pdfFile.readBytes() }
        // ISO-8859-1 maps every byte to exactly one char, keeping string and byte offsets aligned.
        val fileString by lazy { String(fileBytes, Charsets.ISO_8859_1) }
        val fallback: (Int) -> String = { pageIndex ->
            SimplePdfParser.cleanExtractedText(
                SimplePdfParser.extractText(fileBytes, fileString, pageIndex)
            )
        }

        return ParcelFileDescriptor.open(pdfFile, ParcelFileDescriptor.MODE_READ_ONLY).use { descriptor ->
            PdfRenderer(descriptor).use { renderer ->
                val document = StringBuilder()
                var foundText = false
                for (pageIndex in 0 until renderer.pageCount) {
                    currentCoroutineContext().ensureActive()
                    val pageText = extractPageText(renderer, pageIndex, fallback)
                    foundText = foundText || pageText.isNotBlank()
                    document.append("--- PAGE ${pageIndex + 1} ---\n")
                        .append(pageText)
                        .append("\n\n")
                }
                // The page headers make the buffer non-blank on their own, so emptiness
                // has to be judged on the extracted page text.
                check(foundText) { "No text could be extracted from this document." }
                document.toString()
            }
        }
    }

    /**
     * Returns the text of one page: the platform text API on API 35+, otherwise
     * (or when that API fails or yields blank text) the result of [fallback].
     */
    private fun extractPageText(
        renderer: PdfRenderer,
        pageIndex: Int,
        fallback: (Int) -> String,
    ): String {
        if (Build.VERSION.SDK_INT >= 35) {
            val platformText = try {
                extractTextApi35(renderer, pageIndex)
            } catch (e: Exception) {
                Log.w(TAG, "Platform text extraction failed for page ${pageIndex + 1}; using fallback parser", e)
                null
            }
            if (!platformText.isNullOrBlank()) return platformText
        }
        return fallback(pageIndex)
    }

    /** Joins the text blocks reported by the platform for one page with newlines. */
    @RequiresApi(35)
    private fun extractTextApi35(renderer: PdfRenderer, pageIndex: Int): String? {
        return renderer.openPage(pageIndex).use { page ->
            page.textContents.joinToString("\n") { it.text }
        }
    }

    /**
     * Writes [text] (UTF-8) to `Documents/MyApp/<subFolder>/<fileName>`.
     *
     * @return `true` when the file was written, `false` on any failure.
     */
    private fun saveTextToDevice(text: String, fileName: String, subFolder: String): Boolean {
        val targetFolder = "$APP_ROOT_FOLDER/$subFolder"
        return try {
            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
                saveViaMediaStore(text, fileName, targetFolder)
            } else {
                saveToLegacyStorage(text, fileName, targetFolder)
            }
        } catch (e: Exception) {
            Log.w(TAG, "Saving $fileName failed", e)
            false
        }
    }

    /** API 29+: inserts a pending MediaStore row, writes the content, then publishes it. */
    @RequiresApi(Build.VERSION_CODES.Q)
    private fun saveViaMediaStore(text: String, fileName: String, targetFolder: String): Boolean {
        val resolver = appContext.contentResolver
        val pendingValues = ContentValues().apply {
            put(MediaStore.MediaColumns.DISPLAY_NAME, fileName)
            put(MediaStore.MediaColumns.MIME_TYPE, "text/plain")
            put(MediaStore.MediaColumns.RELATIVE_PATH, Environment.DIRECTORY_DOCUMENTS + "/" + targetFolder)
            put(MediaStore.MediaColumns.IS_PENDING, 1)
        }
        // NOTE(review): the Downloads fallback reuses a Documents RELATIVE_PATH, which the
        // Downloads collection presumably rejects; confirm whether this fallback can succeed.
        val uri = resolver.insert(MediaStore.Files.getContentUri("external"), pendingValues)
            ?: resolver.insert(MediaStore.Downloads.EXTERNAL_CONTENT_URI, pendingValues)
            ?: return false

        val written = try {
            resolver.openOutputStream(uri)?.use { out ->
                out.write(text.toByteArray())
                true
            } ?: false // a null stream means nothing was written
        } catch (e: Exception) {
            Log.w(TAG, "Writing $fileName through MediaStore failed", e)
            false
        }
        if (!written) {
            // Do not leave a half-written pending entry behind.
            runCatching { resolver.delete(uri, null, null) }
            return false
        }

        val publishValues = ContentValues().apply { put(MediaStore.MediaColumns.IS_PENDING, 0) }
        resolver.update(uri, publishValues, null, null)
        return true
    }

    /**
     * Pre-API 29: writes straight into the public Documents directory.
     * NOTE(review): presumably requires WRITE_EXTERNAL_STORAGE on these API levels; verify in the manifest.
     */
    private fun saveToLegacyStorage(text: String, fileName: String, targetFolder: String): Boolean {
        @Suppress("DEPRECATION")
        val documentsDir = Environment.getExternalStoragePublicDirectory(Environment.DIRECTORY_DOCUMENTS)
        val dir = File(documentsDir, targetFolder)
        if (!dir.isDirectory && !dir.mkdirs() && !dir.isDirectory) return false
        File(dir, fileName).writeText(text)
        return true
    }

    private companion object {
        const val TAG = "PdfTextExporter"

        // Replace 'MyApp' with your app's designated root folder.
        const val APP_ROOT_FOLDER = "MyApp"
    }
}

// ============================================================================
// 3.
// LOW-LEVEL PDF BINARY PARSER (PRE-API 35 FALLBACK)
// ============================================================================

/**
 * Best-effort text extractor that scans the raw PDF bytes directly.
 *
 * Callers pass the file twice: as bytes and as the same bytes decoded with
 * ISO-8859-1, so that every string index equals the corresponding byte offset.
 *
 * Limitations visible in this implementation: pages are ordered by their file
 * offset (the page tree is not walked), only `/FlateDecode` streams are
 * decompressed, only literal `( ... )` strings inside `BT ... ET` are read, and
 * no font encoding is applied to the extracted characters.
 */
object SimplePdfParser {

    private const val CR: Byte = 13
    private const val LF: Byte = 10

    // "/Type /Page" but not "/Type /Pages" (the page-tree node) or other "/Page..." names.
    private val PAGE_TYPE_REGEX = Regex("""/Type\s*/Page(?![A-Za-z])""")
    private val INDIRECT_REF_REGEX = Regex("""(\d+)\s+(\d+)\s+R""")
    private val TEXT_OBJECT_REGEX = Regex("""BT([\s\S]*?)ET""")
    private val PARAGRAPH_BREAK_REGEX = Regex("\n\\s*\n")
    private val WHITESPACE_RUN_REGEX = Regex("\\s+")

    /**
     * Extracts the text of the page at [targetPageIndex] (0-based).
     *
     * @param pdfBytes raw file content.
     * @param pdfString [pdfBytes] decoded as ISO-8859-1.
     * @return the page text, or an empty string when the page does not exist or
     *   anything goes wrong while parsing (this function never throws).
     */
    fun extractText(pdfBytes: ByteArray, pdfString: String, targetPageIndex: Int): String {
        return try {
            val pageStart = findPageObjects(pdfString).getOrNull(targetPageIndex) ?: return ""
            val pageEnd = pdfString.indexOf("endobj", pageStart)
            if (pageEnd == -1) return ""
            parsePageStream(pdfBytes, pdfString, pdfString.substring(pageStart, pageEnd))
        } catch (e: Throwable) {
            // Deliberately best-effort: a malformed file must never crash the caller.
            ""
        }
    }

    /**
     * Normalises raw extracted text: CRLF becomes LF, blank-line separated
     * paragraphs are kept, and whitespace inside a paragraph collapses to single spaces.
     */
    fun cleanExtractedText(raw: String): String {
        return raw.replace("\r\n", "\n")
            .split(PARAGRAPH_BREAK_REGEX)
            .map { it.replace('\n', ' ').replace(WHITESPACE_RUN_REGEX, " ").trim() }
            .filter { it.isNotEmpty() }
            .joinToString("\n\n")
    }

    /** Returns the start offsets of all page objects, sorted by position in the file. */
    private fun findPageObjects(pdfString: String): List<Int> =
        PAGE_TYPE_REGEX.findAll(pdfString)
            .map { findPrecedingObjStart(pdfString, it.range.first) }
            .filter { it != -1 }
            .distinct()
            .sorted()
            .toList()

    /**
     * Scans backwards from [fromIndex] for the nearest `<number> <generation> obj`
     * header and returns the offset of its object number, or -1 when none is found.
     */
    private fun findPrecedingObjStart(pdfString: String, fromIndex: Int): Int {
        var searchIndex = fromIndex
        while (searchIndex > 0) {
            val objIndex = pdfString.lastIndexOf("obj", searchIndex)
            if (objIndex == -1) return -1

            var p = objIndex - 1
            while (p > 0 && pdfString[p].isWhitespace()) p--
            // A real header has a generation number here; "endobj" has a letter and is skipped.
            if (p > 0 && pdfString[p].isDigit()) {
                while (p > 0 && pdfString[p].isDigit()) p--
                while (p > 0 && pdfString[p].isWhitespace()) p--
                if (p > 0 && pdfString[p].isDigit()) {
                    while (p > 0 && pdfString[p].isDigit()) p--
                    return p + 1
                }
            }
            searchIndex = objIndex - 1
        }
        return -1
    }

    /**
     * Resolves the `/Contents` entry of [pageObj] and concatenates the text of
     * every referenced content stream, one stream per line.
     */
    private fun parsePageStream(pdfBytes: ByteArray, pdfString: String, pageObj: String): String {
        val contentsIndex = pageObj.indexOf("/Contents")
        if (contentsIndex == -1) return ""

        val valueStart = contentsIndex + "/Contents".length
        // The value ends at the next dictionary key or at the end of the dictionary,
        // whichever comes first.
        val valueEnd = listOf(pageObj.indexOf('/', valueStart), pageObj.indexOf(">>", valueStart))
            .filter { it != -1 }
            .minOrNull() ?: return ""
        val contentsValue = pageObj.substring(valueStart, valueEnd)

        return buildString {
            INDIRECT_REF_REGEX.findAll(contentsValue).forEach { ref ->
                val (objectNumber, generation) = ref.destructured
                val streamBytes = readContentStream(pdfBytes, pdfString, objectNumber, generation)
                if (streamBytes != null) append(parseStreamText(streamBytes)).append("\n")
            }
        }.trim()
    }

    /**
     * Returns the (decompressed) data of stream object [objectNumber] [generation],
     * or null when the object is missing, is not a stream, or cannot be decoded.
     */
    private fun readContentStream(
        pdfBytes: ByteArray,
        pdfString: String,
        objectNumber: String,
        generation: String,
    ): ByteArray? {
        return try {
            // The look-behind keeps "4 0 obj" from matching inside "14 0 obj".
            val header = Regex("""(?<!\d)${objectNumber}\s+${generation}\s+obj""").find(pdfString)
                ?: return null
            val objIndex = header.range.first
            val objEndIndex = pdfString.indexOf("endobj", header.range.last + 1)
            if (objEndIndex == -1) return null

            val streamKeyword = pdfString.indexOf("stream", objIndex)
            if (streamKeyword == -1 || streamKeyword >= objEndIndex) return null
            // Only the dictionary may name the filter; the binary data must not be searched.
            val isFlateDecoded = pdfString.substring(objIndex, streamKeyword).contains("/FlateDecode")

            // Skip the end-of-line marker that follows the "stream" keyword.
            var dataStart = streamKeyword + "stream".length
            if (pdfBytes.getOrNull(dataStart) == CR) dataStart++
            if (pdfBytes.getOrNull(dataStart) == LF) dataStart++

            val dataEnd = pdfString.indexOf("endstream", dataStart)
            if (dataEnd == -1) return null

            val rawBytes = pdfBytes.copyOfRange(dataStart, dataEnd)
            if (isFlateDecoded) decompressFlate(rawBytes) else rawBytes
        } catch (e: Exception) {
            null
        }
    }

    /** Inflates zlib/deflate data; returns null when the data cannot be inflated. */
    private fun decompressFlate(compressed: ByteArray): ByteArray? {
        return try {
            InflaterInputStream(ByteArrayInputStream(compressed)).use { it.readBytes() }
        } catch (e: Exception) {
            null
        }
    }

    /**
     * Collects the literal strings of every `BT ... ET` text object in a content
     * stream; text objects are separated by a single space.
     */
    private fun parseStreamText(streamBytes: ByteArray): String {
        return try {
            val content = String(streamBytes, Charsets.ISO_8859_1)
            val text = StringBuilder()
            TEXT_OBJECT_REGEX.findAll(content).forEach { textObject ->
                appendLiteralStrings(textObject.groupValues[1], text)
                text.append(' ')
            }
            text.toString().replace(WHITESPACE_RUN_REGEX, " ").trim()
        } catch (e: Exception) {
            ""
        }
    }

    /**
     * Appends the decoded content of every `( ... )` literal in [source] to [out].
     * Balanced nested parentheses belong to the string; backslash escapes are decoded.
     */
    private fun appendLiteralStrings(source: String, out: StringBuilder) {
        var depth = 0
        var i = 0
        while (i < source.length) {
            val c = source[i]
            when {
                depth == 0 -> if (c == '(') depth = 1
                c == '\\' -> i = appendEscape(source, i + 1, out)
                c == '(' -> {
                    depth++
                    out.append(c)
                }
                c == ')' -> {
                    depth--
                    if (depth > 0) out.append(c)
                }
                else -> out.append(c)
            }
            i++
        }
    }

    /**
     * Decodes the escape sequence whose first character (after the backslash) is at
     * [index] and appends the result to [out].
     *
     * @return the index of the last character consumed by the escape.
     */
    private fun appendEscape(source: String, index: Int, out: StringBuilder): Int {
        if (index >= source.length) return index
        when (val c = source[index]) {
            'n' -> out.append('\n')
            'r' -> out.append('\r')
            't' -> out.append('\t')
            'b' -> out.append('\b')
            'f' -> out.append('\u000C')
            // Backslash + end-of-line is a line continuation and produces no character.
            '\n' -> Unit
            '\r' -> if (source.getOrNull(index + 1) == '\n') return index + 1
            in '0'..'7' -> {
                // Up to three octal digits encode one character code.
                var end = index
                var value = 0
                while (end < source.length && end < index + 3 && source[end] in '0'..'7') {
                    value = value * 8 + (source[end] - '0')
                    end++
                }
                out.append((value and 0xFF).toChar())
                return end - 1
            }
            // "\(", "\)", "\\" and unknown escapes all yield the character itself.
            else -> out.append(c)
        }
        return index
    }
}

// ============================================================================
// 4.
// VIEWMODEL INTEGRATION
// ============================================================================

/**
 * Exposes the PDF text export as observable [ExtractionState].
 *
 * The constructor's [Context] is not stored: only its application context is
 * handed to the exporter, so the ViewModel cannot keep an Activity alive across
 * configuration changes.
 */
class TextExtractionViewModel(context: Context) : ViewModel() {

    private val exporter = PdfTextExporter(context.applicationContext ?: context)

    private val _extractionState = MutableStateFlow<ExtractionState>(ExtractionState.Idle)

    /** Current state of the most recently requested extraction. */
    val extractionState = _extractionState.asStateFlow()

    /**
     * Extracts the text of [pdfFile] and saves it as `<sanitised documentName>_Extracted`.
     *
     * Emits [ExtractionState.Processing] immediately, then [ExtractionState.Success]
     * or [ExtractionState.Error] once the exporter returns.
     *
     * @param pdfFile the PDF to read.
     * @param documentName display name of the document; its extension is dropped and
     *   every character outside `[a-zA-Z0-9_-]` is replaced by `_`.
     */
    fun extractTextFromPdf(pdfFile: File, documentName: String) {
        viewModelScope.launch {
            _extractionState.update { ExtractionState.Processing }

            // Clean up the name so it is safe to use as an output file name.
            val baseName = documentName
                .substringBeforeLast(".")
                .replace(UNSAFE_FILE_NAME_CHARS, "_")
                .ifEmpty { DEFAULT_BASE_NAME } // e.g. a name that is only ".pdf"
            val outputFileName = "${baseName}_Extracted"

            exporter.extractAndSaveText(pdfFile, outputFileName)
                .onSuccess { savedPath ->
                    _extractionState.update {
                        ExtractionState.Success(
                            savedFilePath = savedPath,
                            message = "Successfully exported text to $savedPath",
                        )
                    }
                }
                .onFailure { error ->
                    _extractionState.update {
                        ExtractionState.Error(
                            message = error.localizedMessage ?: "Failed to extract and save text.",
                        )
                    }
                }
        }
    }

    /** Returns the state to [ExtractionState.Idle], e.g. after the UI has shown a result. */
    fun resetState() {
        _extractionState.update { ExtractionState.Idle }
    }

    private companion object {
        val UNSAFE_FILE_NAME_CHARS = Regex("[^a-zA-Z0-9_-]")
        const val DEFAULT_BASE_NAME = "document"
    }
}

// Architectural & Implementation Details
//
// 1. Separation of Concerns: I decoupled the extraction logic and the file-writing logic
//    entirely from the UI and heavy ViewModels. PdfTextExporter acts as a clean,
//    single-responsibility use-case wrapper.
// 2. Batch Processing Loop: Unlike single-page Reflow, extracting the entire document runs
//    sequentially over renderer.pageCount. This avoids memory exhaustion that could occur if
//    you tried to decode and manipulate every page simultaneously using async.
// 3. API 35+ / Pre-API 35 Hybrid Extraction: By checking Build.VERSION.SDK_INT >= 35, we
//    utilize the PdfRenderer.Page.textContents API dynamically for highly accurate layout
//    mapping. Older API levels fall back to the SimplePdfParser regex-based FlateDecode
//    stream parsing.
// 4. Scoped Storage (MediaStore): Implementing API 29+ MediaStore.Files standards.
Devices below API 29 write directly to the public Documents directory (Environment.DIRECTORY_DOCUMENTS). The IS_PENDING flag keeps other apps from seeing the file until the write has finished. 5. Flow State Management: The ExtractionState sealed interface provides Idle, Processing, Success, and Error states, allowing your composable UI to react smoothly to long-running extraction (e.g., showing a LinearProgressIndicator while Processing and a Snackbar once Success is emitted).

===========