pdfreflow
===
package com.yourdomain.pdfreflow
import android.graphics.pdf.PdfRenderer
import android.os.Build
import android.os.ParcelFileDescriptor
import androidx.annotation.RequiresApi
import androidx.compose.foundation.background
import androidx.compose.foundation.layout.Box
import androidx.compose.foundation.layout.Column
import androidx.compose.foundation.layout.fillMaxSize
import androidx.compose.foundation.layout.fillMaxWidth
import androidx.compose.foundation.layout.padding
import androidx.compose.foundation.rememberScrollState
import androidx.compose.foundation.text.selection.SelectionContainer
import androidx.compose.foundation.verticalScroll
import androidx.compose.material3.MaterialTheme
import androidx.compose.material3.Surface
import androidx.compose.material3.Text
import androidx.compose.runtime.Composable
import androidx.compose.runtime.LaunchedEffect
import androidx.compose.runtime.getValue
import androidx.compose.ui.Modifier
import androidx.compose.ui.graphics.Color
import androidx.compose.ui.text.AnnotatedString
import androidx.compose.ui.text.SpanStyle
import androidx.compose.ui.text.buildAnnotatedString
import androidx.compose.ui.unit.TextUnit
import androidx.compose.ui.unit.dp
import androidx.compose.ui.unit.sp
import androidx.lifecycle.ViewModel
import androidx.lifecycle.compose.collectAsStateWithLifecycle
import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.flow.MutableStateFlow
import kotlinx.coroutines.flow.asStateFlow
import kotlinx.coroutines.flow.update
import kotlinx.coroutines.launch
import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock
import kotlinx.coroutines.withContext
import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.File
import java.util.zip.InflaterInputStream
// ============================================================================
// 1. DATA MODELS & STATE
// ============================================================================
data class ReflowUiState(
val isLoading: Boolean = false,
val text: String? = null,
val errorMessage: String? = null
)
// ============================================================================
// 2. CORE EXTRACTOR & PARSER
// ============================================================================
/**
* Handles text extraction using API 35+ PdfRenderer features natively,
* with a custom binary parser fallback for devices running API 24-34.
*/
class PdfTextExtractor(private val file: File) {
private val mutex = Mutex()
private var cachedPdfBytes: ByteArray? = null
private var cachedPdfString: String? = null
suspend fun extractTextFromPage(pageIndex: Int): Result<String> = withContext(Dispatchers.IO) {
mutex.withLock {
runCatching {
var pfd: ParcelFileDescriptor? = null
var renderer: PdfRenderer? = null
try {
pfd = ParcelFileDescriptor.open(file, ParcelFileDescriptor.MODE_READ_ONLY)
renderer = PdfRenderer(pfd)
if (pageIndex < 0 || pageIndex >= renderer.pageCount) {
throw IllegalArgumentException("Page index out of bounds")
}
// Attempt API 35+ native extraction first
if (Build.VERSION.SDK_INT >= 35) {
val nativeText = extractTextApi35(renderer, pageIndex)
if (!nativeText.isNullOrBlank()) {
return@runCatching nativeText
}
}
// Fallback for API < 35 or if native extraction yielded empty results
val (pdfBytes, pdfString) = getCachedPdfData(file)
val parsedText = SimplePdfParser.extractText(pdfBytes, pdfString, pageIndex)
val cleanedText = SimplePdfParser.cleanExtractedText(parsedText)
cleanedText.ifBlank { "No extractable content found on this page." }
} finally {
renderer?.close()
pfd?.close()
}
}
}
}
@RequiresApi(35)
private fun extractTextApi35(renderer: PdfRenderer, pageIndex: Int): String? {
return try {
renderer.openPage(pageIndex).use { page ->
page.textContents.joinToString("\n") { it.text }
}
} catch (e: Exception) {
null
}
}
private fun getCachedPdfData(file: File): Pair<ByteArray, String> {
if (cachedPdfBytes != null && cachedPdfString != null) {
return Pair(cachedPdfBytes!!, cachedPdfString!!)
}
val bytes = file.readBytes()
val str = String(bytes, Charsets.ISO_8859_1)
cachedPdfBytes = bytes
cachedPdfString = str
return Pair(bytes, str)
}
}
/**
* Pre-API 35 fallback text parser. Scans PDF binary streams, decompresses FlateDecode
* content, and parses raw text blocks.
*/
object SimplePdfParser {
fun extractText(pdfBytes: ByteArray, pdfString: String, targetPageIndex: Int): String {
return try {
val offsets = findPageObjects(pdfString)
if (targetPageIndex !in offsets.indices) return ""
val pageObjEnd = pdfString.indexOf("endobj", offsets[targetPageIndex])
if (pageObjEnd == -1) return ""
parsePageStream(pdfBytes, pdfString, pdfString.substring(offsets[targetPageIndex], pageObjEnd))
} catch (e: Throwable) {
""
}
}
fun cleanExtractedText(raw: String): String {
return raw.replace("\r\n", "\n")
.split(Regex("\n\\s*\n"))
.map { it.replace('\n', ' ').replace(Regex("\\s+"), " ").trim() }
.filter { it.isNotEmpty() }
.joinToString("\n\n")
}
private fun findPageObjects(pdfString: String): List<Int> {
val offsets = mutableListOf<Int>()
var index = 0
while (true) {
val pageTypeIndex = listOf("/Type /Page", "/Type/Page", "/Page")
.map { pdfString.indexOf(it, index) }
.filter { it != -1 }
.minOrNull() ?: break
val objStart = findPrecedingObjStart(pdfString, pageTypeIndex)
if (objStart != -1 && objStart !in offsets) offsets.add(objStart)
index = pageTypeIndex + 5
}
return offsets.sorted()
}
private fun findPrecedingObjStart(pdfString: String, fromIndex: Int): Int {
var searchIndex = fromIndex
while (searchIndex > 0) {
val objIndex = pdfString.lastIndexOf("obj", searchIndex)
if (objIndex == -1) return -1
var p = objIndex - 1
while (p > 0 && pdfString[p].isWhitespace()) p--
if (p > 0 && pdfString[p].isDigit()) {
while (p > 0 && pdfString[p].isDigit()) p--
while (p > 0 && pdfString[p].isWhitespace()) p--
if (p > 0 && pdfString[p].isDigit()) {
while (p > 0 && pdfString[p].isDigit()) p--
return p + 1
}
}
searchIndex = objIndex - 1
}
return -1
}
private fun parsePageStream(pdfBytes: ByteArray, pdfString: String, pageObj: String): String {
val contentsIndex = pageObj.indexOf("/Contents")
if (contentsIndex == -1) return ""
val contentsValueStart = contentsIndex + 9
var endOfValue = pageObj.indexOf("/", contentsValueStart)
if (endOfValue == -1) endOfValue = pageObj.indexOf(">>", contentsValueStart)
if (endOfValue == -1) return ""
val contentsValue = pageObj.substring(contentsValueStart, endOfValue).trim()
val refs = mutableListOf<String>()
Regex("""(\d+\s+\d+\s+R)""").findAll(contentsValue).forEach { refs.add(it.value) }
val extractedTextBuilder = StringBuilder()
for (ref in refs) {
try {
val parts = ref.split(" ")
val objStartTag = "${parts[0]} ${parts[1]} obj"
val objIndex = pdfString.indexOf(objStartTag)
if (objIndex == -1) continue
val objEndIndex = pdfString.indexOf("endobj", objIndex + objStartTag.length)
val isFlateDecoded = pdfString.substring(objIndex, objEndIndex).contains("/FlateDecode")
val streamStartIndex = pdfString.indexOf("stream", objIndex)
if (streamStartIndex == -1 || streamStartIndex >= objEndIndex) continue
var realStreamStart = streamStartIndex + 6
if (realStreamStart < pdfBytes.size && pdfBytes[realStreamStart] == '\r'.code.toByte()) realStreamStart++
if (realStreamStart < pdfBytes.size && pdfBytes[realStreamStart] == '\n'.code.toByte()) realStreamStart++
val streamEndIndex = pdfString.indexOf("endstream", realStreamStart)
if (streamEndIndex == -1) continue
val compressedBytes = pdfBytes.copyOfRange(realStreamStart, streamEndIndex)
val streamBytes = if (isFlateDecoded) decompressFlate(compressedBytes) else compressedBytes
if (streamBytes != null) {
extractedTextBuilder.append(parseStreamText(streamBytes)).append("\n")
}
} catch (e: Exception) {
continue
}
}
return extractedTextBuilder.toString().trim()
}
private fun decompressFlate(compressed: ByteArray): ByteArray? {
return try {
ByteArrayInputStream(compressed).use { stream ->
InflaterInputStream(stream).use { inflaterStream ->
ByteArrayOutputStream().use { outputStream ->
val buffer = ByteArray(1024)
var length: Int
while (inflaterStream.read(buffer).also { length = it } != -1) {
outputStream.write(buffer, 0, length)
}
outputStream.toByteArray()
}
}
}
} catch (e: Exception) {
null
}
}
private fun parseStreamText(streamBytes: ByteArray): String {
return try {
val textBuilder = StringBuilder()
Regex("""BT([\s\S]*?)ET""").findAll(String(streamBytes, Charsets.ISO_8859_1)).forEach { btMatch ->
val btContent = btMatch.groupValues[1]
var i = 0
var inParentheses = false
var isEscaped = false
val currentWord = StringBuilder()
while (i < btContent.length) {
val char = btContent[i]
if (isEscaped) {
currentWord.append(char)
isEscaped = false
} else if (char == '\\') {
isEscaped = true
} else if (char == '(') {
inParentheses = true
} else if (char == ')') {
inParentheses = false
if (currentWord.isNotEmpty()) {
textBuilder.append(currentWord.toString())
currentWord.clear()
}
} else if (inParentheses) {
currentWord.append(char)
}
i++
}
textBuilder.append(" ")
}
textBuilder.toString().replace(Regex("""\s+"""), " ").trim()
} catch (e: Exception) {
""
}
}
}
// ============================================================================
// 3. VIEWMODEL
// ============================================================================
class ReflowViewModel(private val file: File) : ViewModel() {
private val extractor = PdfTextExtractor(file)
private val _uiState = MutableStateFlow(ReflowUiState())
val uiState = _uiState.asStateFlow()
fun loadPageText(pageIndex: Int) {
viewModelScope.launch {
_uiState.update { it.copy(isLoading = true, errorMessage = null) }
extractor.extractTextFromPage(pageIndex)
.onSuccess { extractedText ->
_uiState.update { it.copy(isLoading = false, text = extractedText) }
}
.onFailure { exception ->
_uiState.update {
it.copy(isLoading = false, errorMessage = exception.localizedMessage ?: "Unknown error occurred")
}
}
}
}
}
// ============================================================================
// 4. COMPOSE UI COMPONENT
// ============================================================================
@Composable
fun PdfTextReflowScreen(
viewModel: ReflowViewModel,
pageIndex: Int,
fontSize: TextUnit = 16.sp,
lineHeightMultiplier: Float = 1.55f,
searchQuery: String = "",
modifier: Modifier = Modifier
) {
val uiState by viewModel.uiState.collectAsStateWithLifecycle()
LaunchedEffect(pageIndex) {
viewModel.loadPageText(pageIndex)
}
Surface(
modifier = modifier.fillMaxSize(),
color = MaterialTheme.colorScheme.background
) {
when {
uiState.isLoading -> {
Box(
modifier = Modifier.fillMaxSize(),
contentAlignment = androidx.compose.ui.Alignment.Center
) {
androidx.compose.material3.CircularProgressIndicator()
}
}
uiState.errorMessage != null -> {
Box(
modifier = Modifier.fillMaxSize().padding(32.dp),
contentAlignment = androidx.compose.ui.Alignment.Center
) {
Text(
text = uiState.errorMessage!!,
color = MaterialTheme.colorScheme.error,
style = MaterialTheme.typography.bodyLarge
)
}
}
uiState.text != null -> {
val annotatedString = buildSearchAnnotatedString(uiState.text!!, searchQuery)
Column(
modifier = Modifier
.fillMaxSize()
.padding(horizontal = 16.dp, vertical = 8.dp)
.verticalScroll(rememberScrollState())
) {
SelectionContainer {
Text(
text = annotatedString,
style = MaterialTheme.typography.bodyLarge.copy(
fontSize = fontSize,
lineHeight = fontSize * lineHeightMultiplier
),
color = MaterialTheme.colorScheme.onBackground,
modifier = Modifier.fillMaxWidth()
)
}
}
}
}
}
}
/**
* Utility to highlight search queries directly in the reflow text.
*/
@Composable
private fun buildSearchAnnotatedString(text: String, query: String): AnnotatedString {
if (query.isBlank()) return AnnotatedString(text)
return buildAnnotatedString {
append(text)
var startIndex = text.indexOf(query, ignoreCase = true)
while (startIndex >= 0) {
val endIndex = startIndex + query.length
addStyle(
style = SpanStyle(
background = Color(0xFFFFEB3B).copy(alpha = 0.5f),
color = Color.Black
),
start = startIndex,
end = endIndex
)
startIndex = text.indexOf(query, startIndex + 1, ignoreCase = true)
}
}
}
===
Architectural & Implementation Details
Isolation Strategy: I decoupled SimplePdfParser and the native extraction logic entirely from PdfViewerViewModel and isolated it into a clean, reusable PdfTextExtractor class. This class takes a standard java.io.File, meaning you can reuse it across any part of your new app regardless of Jetpack Navigation or Context limits.
API Fallback Mechanism: Starting with Android 15 (API 35), PdfRenderer.Page.textContents handles text extraction beautifully natively. For API 24-34, I retained your robust SimplePdfParser which natively analyzes PDF binary streams, bypasses FlateDecode compression, and locates /Contents dictionary tags.
Concurrency & Thread Safety: Mutex locks are used inside PdfTextExtractor to guarantee thread safety when calling ParcelFileDescriptor.open, since PdfRenderer isn't entirely thread-safe inherently on older OS versions. All parsing operations strictly run on Dispatchers.IO.
UI Reflow Integration (: The Compose layer handles the dynamic sizing via parameters fontSize and lineHeightMultiplier. I integrated SelectionContainer so the users can copy text. Added a utility function (buildSearchAnnotatedString) to automatically highlight search queries using AnnotatedString when reading the reflowed output. ===========