lix-doc: update dependencies and refactor

This updates the version of rnix used and refactors the code generally to be more precise and capable in it's identification of both lambdas and determining which documentation comments are attached. Change-Id: Ib0dddabd71f772c95077f9d7654023b37a7a1fd2
2024-06-18 16:24:49 -06:00 · 2024-06-18 16:24:49 -06:00 · 41963df4a5
commit 41963df4a5
parent 6e0ca02425
6 changed files with 395 additions and 247 deletions
--- a/lix-doc/Cargo.lock
+++ b/lix-doc/Cargo.lock
@ -9,13 +9,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

 [[package]]
-name = "cbitset"
-version = "0.2.0"
+name = "countme"
+version = "3.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "29b6ad25ae296159fb0da12b970b2fe179b234584d7cd294c891e2bbb284466b"
-dependencies = [
- "num-traits",
-]
+checksum = "7704b5fdd17b18ae31c4c1da5a2e0305a2bf17b5249300a9ee9ed7b72114c636"

 [[package]]
 name = "dissimilar"
@ -33,19 +30,26 @@ dependencies = [
 "once_cell",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
 [[package]]
 name = "lix-doc"
 version = "0.0.1"
 dependencies = [
 "expect-test",
 "rnix",
+ "rowan",
 ]

 [[package]]
-name = "num-traits"
-version = "0.2.18"
+name = "memoffset"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
+checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
 dependencies = [
 "autocfg",
 ]
@ -56,44 +60,26 @@ version = "1.19.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"

-[[package]]
-name = "proc-macro2"
-version = "1.0.79"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
-dependencies = [
- "unicode-ident",
-]
-
-[[package]]
-name = "quote"
-version = "1.0.35"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
-dependencies = [
- "proc-macro2",
-]
-
 [[package]]
 name = "rnix"
-version = "0.8.1"
+version = "0.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0a9b645f0edba447dbfc6473dd22999f46a1d00ab39e777a2713a1cf34a1597b"
+checksum = "bb35cedbeb70e0ccabef2a31bcff0aebd114f19566086300b8f42c725fc2cb5f"
 dependencies = [
- "cbitset",
 "rowan",
 ]

 [[package]]
 name = "rowan"
-version = "0.9.1"
+version = "0.15.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ea7cadf87a9d8432e85cb4eb86bd2e765ace60c24ef86e79084dcae5d1c5a19"
+checksum = "32a58fa8a7ccff2aec4f39cc45bf5f985cec7125ab271cf681c279fd00192b49"
 dependencies = [
+ "countme",
+ "hashbrown",
+ "memoffset",
 "rustc-hash",
- "smol_str",
- "text_unit",
- "thin-dst",
+ "text-size",
 ]

 [[package]]
@ -103,59 +89,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"

 [[package]]
-name = "serde"
-version = "1.0.197"
+name = "text-size"
+version = "1.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
-dependencies = [
- "serde_derive",
-]
-
-[[package]]
-name = "serde_derive"
-version = "1.0.197"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "smol_str"
-version = "0.1.24"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fad6c857cbab2627dcf01ec85a623ca4e7dcb5691cbaa3d7fb7653671f0d09c9"
-dependencies = [
- "serde",
-]
-
-[[package]]
-name = "syn"
-version = "2.0.53"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032"
-dependencies = [
- "proc-macro2",
- "quote",
- "unicode-ident",
-]
-
-[[package]]
-name = "text_unit"
-version = "0.1.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "20431e104bfecc1a40872578dbc390e10290a0e9c35fffe3ce6f73c15a9dbfc2"
-
-[[package]]
-name = "thin-dst"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "db3c46be180f1af9673ebb27bc1235396f61ef6965b3fe0dbb2e624deb604f0e"
-
-[[package]]
-name = "unicode-ident"
-version = "1.0.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+checksum = "f18aa187839b2bdb1ad2fa35ead8c4c2976b64e4363c386d45ac0f7ee85c9233"
--- a/lix-doc/Cargo.toml
+++ b/lix-doc/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 description = "Nix function documentation tool, stripped down into a library"
-edition = "2018"
+edition = "2021"
 name = "lix-doc"
 version = "0.0.1"
 license = "BSD-2-Clause OR MIT"
@ -12,7 +12,9 @@ repository = "https://github.com/lf-/nix-doc"
 crate_type = ["staticlib"]

 [dependencies]
-rnix = "0.8.0"
+rnix = "0.11.0"
+# Necessary because rnix fails to export a critical trait (Rowan's AstNode).
+rowan = "0.15.0"

 [dev-dependencies]
 expect-test = "1.1.0"
--- a/lix-doc/src/lib.rs
+++ b/lix-doc/src/lib.rs
@ -1,5 +1,5 @@
 // SPDX-FileCopyrightText: 2024 Jade Lovelace
-//
+// SPDX-FileCopyrightText: 2024 Lunaphied
 // SPDX-License-Identifier: BSD-2-Clause OR MIT

 //! library components of nix-doc
@ -7,13 +7,16 @@ pub mod pprint;

 use crate::pprint::pprint_args;

-use rnix::types::{Lambda, TypedNode};
-use rnix::SyntaxKind::*;
-use rnix::{NodeOrToken, SyntaxNode, TextUnit, WalkEvent};
+use rnix::ast::{self, Lambda};
+use rnix::{NodeOrToken, SyntaxKind};
+use rnix::SyntaxNode;
+
+
+// Needed because rnix fucked up and didn't reexport this, oops.
+use rowan::ast::AstNode;

 use std::ffi::{CStr, CString};
 use std::fs;
-use std::iter;
 use std::os::raw::c_char;
 use std::panic;

@ -23,66 +26,104 @@ use std::{fmt::Display, str};

 pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

-const DOC_INDENT: usize = 3;
-
 struct SearchResult {
    /// Name of the function
    identifier: String,

-    /// Dedented documentation comments
+    /// Dedented documentation comment
    doc: String,

    /// Parameter block for the function
    param_block: String,
 }

-fn find_pos(file: &str, line: usize, col: usize) -> usize {
-    let mut lines = 1;
-    let mut line_start = 0;
-    let mut it = file.chars().enumerate().peekable();
-    while let Some((count, ch)) = it.next() {
-        if ch == '\n' || ch == '\r' {
-            lines += 1;
-            let addend = if ch == '\r' && it.peek().map(|x| x.1) == Some('\n') {
-                it.next();
-                1
-            } else {
-                0
-            };
-            line_start = count + addend;
-        }
-
-        let col_diff = ((count as i32) - (line_start as i32)).abs() as usize;
-        if lines == line && col_diff == col {
-            return count;
-        }
-    }
-    unreachable!();
-}
-
 impl SearchResult {
    fn format<P: Display>(&self, filename: P, line: usize) -> String {
        format!(
            "**Synopsis:** `{}` = {}\n\n{}\n\n# {}",
            self.identifier.as_str(),
            self.param_block,
-            indented(&self.doc, DOC_INDENT),
+            self.doc,
            format!("{}:{}", filename, line).as_str(),
        )
    }
 }

-/// Emits a string `s` indented by `indent` spaces
-fn indented(s: &str, indent: usize) -> String {
-    let indent_s = iter::repeat(' ').take(indent).collect::<String>();
-    s.split('\n')
-        .map(|line| indent_s.clone() + line)
-        .collect::<Vec<_>>()
-        .join("\n")
+/// Converts Nix compatible line endings (Nix accepts `\r`, `\n`, *and* `\r\n` as endings), to
+/// standard `\n` endings for use within Rust land.
+fn convert_endings(s: &str) -> String {
+    let mut out = String::with_capacity(s.len());
+    let mut it = s.chars().peekable();
+
+    while let Some(ch) = it.next() {
+        if ch == '\n' || ch == '\r' {
+            out.push('\n');
+            if ch == '\r' && it.peek().map(|&c| c == '\n').unwrap_or(false) {
+                // Consume `\n` in `\r\n`.
+                it.next();
+            }
+        } else {
+            out.push(ch);
+        }
+    }
+
+    out
+}
+
+/// Converts the position information from Lix itself into an character index into the file itself.
+/// Expects an input string that's already had it's line endings normalized.
+///
+/// Note that this returns a *byte* offset, not a character offset.
+fn find_pos(s: &str, line: usize, col: usize) -> usize {
+    // Nix line positions are 1-indexed.
+    let mut lines = 1;
+    for (byte_pos, ch) in s.char_indices() {
+        // If we find a newline, increase the line count.
+        if ch == '\n' {
+            lines += 1;
+        }
+
+        // We've arrived at the correct line.
+        if lines == line {
+            // Column position is 1-indexed, and it's a *byte* offset, because Nix doesn't actually
+            // support UTF-8. Rust does though, so we need to convert to a proper byte index to
+            // match rnix. Lix also doesn't consider the line endings part of the column offset so
+            // we implicitly add one to advance to the character *after* that.
+            return byte_pos + col;
+        }
+    }
+
+    // If things never match that should be literally impossible.
+    unreachable!();
+}
+
+/// Represents a forwarded token from rnix's AST over to lix-doc.
+#[derive(Debug, Clone)]
+enum DocToken {
+    Comment(String),
+    Whitespace(String),
+}
+
+/// Determine if a given token string contains more than two newlines, this is used to determine when
+/// we hit blank lines between comments indicating a contextually unrelated comment.
+fn has_empty_line(tok: &DocToken) -> bool {
+    // It's either solely whitespace with two newlines inside somewhere, or it's
+    // contained inside a comment token and we don't want to count that as empty.
+    if let DocToken::Whitespace(s) = tok {
+        s.chars().filter(|&c| c == '\n').take(2).count() == 2
+    } else {
+        false
+    }
 }

 /// Cleans up a single line, erasing prefix single line comments but preserving indentation
-fn cleanup_single_line<'a>(s: &'a str) -> &'a str {
+// NOTE: We have a bit of a conflict of interest problem here due to the inconsistent format of
+// doc comments. Some doc comments will use a series of single line comments that may then contain `*`
+// characters to represent a list. Some will be multiline comments that don't prefix individual lines
+// with `*`, only using them for lists directly, and some will prefix lines with `*` as a leading
+// character to mark the block. There's no way to disambiguate all three, but we do our best to
+// make the common case pretty.
+fn cleanup_single_line(s: &str) -> &str {
    let mut cmt_new_start = 0;
    let mut iter = s.char_indices().peekable();
    while let Some((idx, ch)) = iter.next() {
@ -90,7 +131,9 @@ fn cleanup_single_line<'a>(s: &'a str) -> &'a str {
        let (_, next_ch) = iter.peek().unwrap_or(&(0, '\n'));

        // if we find a character, save the byte position after it as our new string start
-        if ch == '#' || (ch == '*' && next_ch.is_whitespace()) {
+        // This has special handling for `>` because some Nixpkgs documentation has `*>` right
+        // after the start of their doc comments, and we want to strip the `*` still.
+        if ch == '#' || (ch == '*' && (*next_ch == '>' || next_ch.is_whitespace())) {
            cmt_new_start = idx + 1;
            break;
        }
@ -103,15 +146,12 @@ fn cleanup_single_line<'a>(s: &'a str) -> &'a str {
    &s[cmt_new_start..]
 }

-/// Erases indents in comments. This is *almost* a normal dedent function, but it starts by looking
-/// at the second line if it can.
+/// Erases indents in comments based on the indentation of the first line.
 fn dedent_comment(s: &str) -> String {
    let mut whitespaces = 0;
-    let mut lines = s.lines();
-    let first = lines.next();

    // scan for whitespace
-    for line in lines.chain(first) {
+    for line in s.lines() {
        let line_whitespace = line.chars().take_while(|ch| ch.is_whitespace()).count();

        if line_whitespace != line.len() {
@ -121,16 +161,6 @@ fn dedent_comment(s: &str) -> String {
        }
    }

-    // maybe the first considered line we found was indented further, so let's look for more lines
-    // that might have a shorter indent. In the case of one line, do nothing.
-    for line in s.lines().skip(1) {
-        let line_whitespace = line.chars().take_while(|ch| ch.is_whitespace()).count();
-
-        if line_whitespace != line.len() {
-            whitespaces = line_whitespace.min(whitespaces);
-        }
-    }
-
    // delete up to `whitespaces` whitespace characters from each line and reconstitute the string
    let mut out = String::new();
    for line in s.lines() {
@ -143,69 +173,163 @@ fn dedent_comment(s: &str) -> String {
    out
 }

-/// Deletes whitespace and leading comment characters
+/// Takes a series of comment and whitespace strings and output a clean single block of text to use
+/// as the output documentation comment block.
 ///
-/// Oversight we are choosing to ignore: if you put # characters at the beginning of lines in a
-/// multiline comment, they will be deleted.
-fn cleanup_comments<S: AsRef<str>, I: DoubleEndedIterator<Item = S>>(comment: &mut I) -> String {
+/// This function expects to be given the tokens in reverse order (proceeding upwards from the
+/// first comment above the definitions), this allows us to properly enforce the below conditions.
+/// The output from this function will be reordered and ready for display.
+///
+/// The two types of documentation comments we expect are:
+///
+/// - A single multiline comment not whitespace separated from the start.
+/// - A series of back to back single line comments not separated by whitespace.
+///
+/// Any other combination will be filtered out.
+///
+/// Once an empty line is encountered, we know no more valid documentation comments remain and stop.
+fn cleanup_comments<I: Iterator<Item = DocToken>>(tokens: &mut I) -> String {
+    // Keep track of when we've found a single line and multiline comment, we use this to
+    // only process a single multiline or back to back single lines.
+    let mut found_single_line = false;
+
+    // Comments that have survived our filtering phase and should be cleaned up.
+    let mut valid = vec![];
+
+    // Filter out comments that don't meet the characteristics of documentation comments.
+    for tok in tokens {
+        if has_empty_line(&tok) {
+            // Take tokens until we hit whitespace containing an empty line.
+            break;
+        }
+
+        // Only care about comments from this point on.
+        if let DocToken::Comment(comment) = tok {
+            // Now determine if it's a single line comment.
+            let is_single_line = comment.starts_with('#');
+
+            // We've found a single line comment if we've found one before or we just found one.
+            found_single_line |= is_single_line;
+
+            // What we do next is only special when we hit a multiline comment.
+            if !is_single_line {
+                // If we've hit a multiline comment as our first comment, take that one alone.
+                if !found_single_line {
+                    // Otherwise we've hit a multiline comment immediately and this is our
+                    // one and only doc comment to worry about.
+                    valid.push(comment);
+                }
+                // Otherwise we've hit a multiline comment after single line comments, in either
+                // case this means we're done processing comments.
+                break;
+            }
+
+            // Otherwise this is a new single line comment to push to the stack.
+            valid.push(comment);
+        }
+    }
+
+    // Cleanup comments for user consumption.
    dedent_comment(
-        &comment
+        &valid
+            .into_iter()
            .rev()
            .map(|small_comment| {
                small_comment
-                    .as_ref()
-                    // space before multiline start
-                    .trim_start()
-                    // multiline starts
+                    // Trim off start of multiline comments.
                    .trim_start_matches("/*")
-                    // trailing so we can grab multiline end
-                    .trim_end()
-                    // multiline ends
+                    // Trim off end of multiline comments.
                    .trim_end_matches("*/")
-                    // extra space that was in the multiline
+                    // Trim off any internal whitespace that's trapped inside comments themselves.
                    .trim()
+                    // Split comments by newlines to extract lines of multiline comments.
                    .split('\n')
-                    // erase single line comments and such
+                    // Cleanup single line comments and a few more tweaks for multiline comments.
                    .map(cleanup_single_line)
                    .collect::<Vec<_>>()
+                    // Reconstruct the multiline comment's whitespace.
                    .join("\n")
            })
            .collect::<Vec<_>>()
-            .join("\n"),
+            // We've found that when multiple back to back single line comments are used in Nixpkgs,
+            // they make more sense to represent as if someone inserted line breaks into the Markdown
+            // properly, so we join them with linebreaks that markdown will pass through.
+            .join("\n\n"),
    )
 }

-/// Get the docs for a specific function
+/// Get the docs for a specific function.
+// TODO: Improve error reporting?
 pub fn get_function_docs(filename: &str, line: usize, col: usize) -> Option<String> {
    let content = fs::read(filename).ok()?;
-    let decoded = str::from_utf8(&content).ok()?;
+    let decoded = convert_endings(str::from_utf8(&content).ok()?);
    let pos = find_pos(&decoded, line, col);
-    let rowan_pos = TextUnit::from_usize(pos);
-    let tree = rnix::parse(decoded);
+    let rowan_pos = rnix::TextSize::from(pos as u32);

+    // The minimum length of a lambda is 4 characters and thus the range we're looking for must be
+    // at least 4 characters long `_: 3` being an example of a minimal length lambda.
+    let rowan_range = rnix::TextRange::at(rowan_pos, 4.into());
+
+    // Parse the file  using rnix.
+    let root = rnix::Root::parse(&decoded).ok().ok()?;
+
+    // Extract the inner expression that represents the Root node and extract the top level expression.
+    let expr = root.expr()?;
+
+    // There are two cases we have to be able to handle
+    // 1. A straightforward definition with an attrset binding to a lambda that's defined inline.
+    // 2. A lambda defined in a standalone file where the attrset binding imports that file directly.
+    // The latter case will not be able to find the binding so we must be able to handle not finding it.
+
+    // Find the deepest node or token that covers the position given by Lix.
+    let covering = expr.syntax().covering_element(rowan_range);
+
+    // Climb up until we find the lambda node that contains that token.
    let mut lambda = None;
-    for node in tree.node().preorder() {
-        match node {
-            WalkEvent::Enter(n) => {
-                if n.text_range().start() >= rowan_pos && n.kind() == NODE_LAMBDA {
-                    lambda = Lambda::cast(n);
+    for ancestor in covering.ancestors() {
+        if ancestor.kind() == SyntaxKind::NODE_LAMBDA {
+            lambda = Some(ancestor);
            break;
        }
    }
-            WalkEvent::Leave(_) => (),
+
+    // There is literally always a lambda or something has gone very very wrong.
+    let lambda =
+        ast::Lambda::cast(
+            lambda.expect("no lambda found; what.")
+        ) .expect("not a rnix::ast::Lambda; what.");
+
+    // Search up, hopefully to find the binding so we can get the identifier name.
+    // TODO: Just provide this directly from the C++ code to make it possible to always have the correct identifier.
+    let mut binding = None;
+    for ancestor in lambda.syntax().ancestors() {
+        if ancestor.kind() == SyntaxKind::NODE_ATTRPATH_VALUE {
+            binding = Some(ancestor);
        }
    }
-    let lambda = lambda?;
-    let res = visit_lambda("func".to_string(), &lambda);
-    Some(res.format(filename, line))
-}

-fn visit_lambda(name: String, lambda: &Lambda) -> SearchResult {
+    // Convert the binding to an identifier if it was found, otherwise use a placeholder.
+    let identifier;
+    identifier = match binding.clone() {
+        Some(binding) => ast::AttrpathValue::cast(binding)
+            .expect("not an rnix::ast::AttrpathValue; what")
+            .attrpath()
+            .expect("AttrpathValue has no attrpath; what.")
+            .to_string(),
+        _ => "<unknown binding>".to_string(),
+    };
+
+    // Find all the comments on the binding or the lambda if we have to fall back.
+    let comment_node = binding.as_ref().unwrap_or(lambda.syntax());
+    let comment = find_comment(comment_node).unwrap_or_else(String::new);
+
+    // And display them properly for the markdown function in Lix.
+    Some(visit_lambda(identifier, comment, &lambda).format(filename, line))
+}
+
+fn visit_lambda(name: String, comment: String, lambda: &Lambda) -> SearchResult {
    // grab the arguments
-    let param_block = pprint_args(&lambda);
-
-    // find the doc comment
-    let comment = find_comment(lambda.node().clone()).unwrap_or_else(|| "".to_string());
+    let param_block = pprint_args(lambda);

    SearchResult {
        identifier: name,
@ -214,39 +338,47 @@ fn visit_lambda(name: String, lambda: &Lambda) -> SearchResult {
    }
 }

-fn find_comment(node: SyntaxNode) -> Option<String> {
-    let mut node = NodeOrToken::Node(node);
-    let mut comments = Vec::new();
-    loop {
-        loop {
-            if let Some(new) = node.prev_sibling_or_token() {
-                node = new;
-                break;
-            } else {
-                node = NodeOrToken::Node(node.parent()?);
-            }
+fn find_comment(node: &SyntaxNode) -> Option<String> {
+    let mut it = node
+        .siblings_with_tokens(rowan::Direction::Prev)
+        // Skip ourselves as we're always the first token returned.
+        .skip(1)
+        .peekable();
+
+    // Consume up to one whitespace token before the first comment. There might not always be
+    // whitespace such as the (rather unusual) case of `/* meow */x = a: 3`.
+    if matches!(it.peek(), Some(NodeOrToken::Token(token)) if token.kind() == SyntaxKind::TOKEN_WHITESPACE) {
+        it.next();
    }

-        match node.kind() {
-            TOKEN_COMMENT => match &node {
-                NodeOrToken::Token(token) => comments.push(token.text().clone()),
-                NodeOrToken::Node(_) => unreachable!(),
-            },
-            // This stuff is found as part of `the-fn = f: ...`
-            // here:                           ^^^^^^^^
-            NODE_KEY | TOKEN_ASSIGN => (),
-            t if t.is_trivia() => (),
-            _ => break,
+    let comments = it.map_while(|element| match element {
+            NodeOrToken::Token(token) => {
+                match token.kind() {
+                    // Map the tokens we're interested in to our internal token type.
+                    SyntaxKind::TOKEN_COMMENT => Some(DocToken::Comment(token.text().to_owned())),
+                    SyntaxKind::TOKEN_WHITESPACE => {
+                        Some(DocToken::Whitespace(token.text().to_owned()))
+                    }
+                    // If we hit a different token type, we know we've gone past relevant comments
+                    // and should stop.
+                    _ => None,
                }
            }
-    let doc = cleanup_comments(&mut comments.iter().map(|c| c.as_str()));
-    Some(doc).filter(|it| !it.is_empty())
+            // If we hit a node entry we've definitely gone past comments that would be related to
+            // this node and we should retreat.
+            _ => None,
+        });
+
+    // For the curious, `into_iter()` here consumes the binding producing an owned value allowing us to avoid
+    // making the original binding mutable, we don't reuse it later so this is a cute way to handle it, though
+    // there's probably a better way we just can't remember.
+    Some(cleanup_comments(&mut comments.into_iter())).filter(|c| !c.is_empty())
 }

 /// Get the docs for a function in the given file path at the given file position and return it as
 /// a C string pointer
 #[no_mangle]
-pub extern "C" fn nd_get_function_docs(
+pub extern "C" fn lixdoc_get_function_docs(
    filename: *const c_char,
    line: usize,
    col: usize,
@ -269,9 +401,9 @@ pub extern "C" fn nd_get_function_docs(
        .unwrap_or(ptr::null())
 }

-/// Call this to free a string from nd_get_function_docs
+/// Call this to free a string from `lixdoc_get_function_docs`.
 #[no_mangle]
-pub extern "C" fn nd_free_string(s: *const c_char) {
+pub extern "C" fn lixdoc_free_string(s: *const c_char) {
    unsafe {
        // cast note: this cast is turning something that was cast to const
        // back to mut
@ -283,35 +415,57 @@ pub extern "C" fn nd_free_string(s: *const c_char) {
 mod tests {
    use super::*;

+    #[test]
+    fn test_line_conversion() {
+        let fakefile = "abc\rdef\r\nghi";
+        assert_eq!(convert_endings(fakefile), "abc\ndef\nghi");
+    }
+
    #[test]
    fn test_bytepos() {
        let fakefile = "abc\ndef\nghi";
        assert_eq!(find_pos(fakefile, 2, 2), 5);
    }

+    #[test]
+    fn test_bytepos_unusual() {
+        let fakefile = convert_endings("abc\rdef\r\nghi");
+        assert_eq!(find_pos(&fakefile, 2, 2), 5);
+        assert_eq!(find_pos(&fakefile, 3, 2), 9);
+    }
+
+    /// This test is to check that we correctly resolve byte positions even when inconsistent with
+    /// character positions.
    #[test]
    fn test_bytepos_cursed() {
-        let fakefile = "abc\rdef\r\nghi";
-        assert_eq!(find_pos(fakefile, 2, 2), 5);
-        assert_eq!(find_pos(fakefile, 3, 2), 10);
+        let fakefile = "hello\nwórld";
+        // Try to find the position of the `r` after world, which will be wrong if we don't handle
+        // UTF-8 properly.
+        let pos = find_pos(&fakefile, 2, 4);
+        dbg!(&fakefile[pos..]);
+        assert_eq!(pos, 9)
    }

    #[test]
    fn test_comment_stripping() {
-        let ex1 = ["/* blah blah blah\n      foooo baaar\n   blah */"];
+        let ex1 = [DocToken::Comment(
+            "/* blah blah blah\n      foooo baaar\n   blah */".to_string(),
+        )];
        assert_eq!(
-            cleanup_comments(&mut ex1.iter()),
+            cleanup_comments(&mut ex1.into_iter()),
            "blah blah blah\n      foooo baaar\n   blah"
        );

-        let ex2 = ["# a1", "#    a2", "# aa"];
-        assert_eq!(cleanup_comments(&mut ex2.iter()), "aa\n   a2\na1");
+        let ex2 = ["# a1", "#    a2", "# aa"]
+            .into_iter()
+            .map(|s| DocToken::Comment(s.to_string()));
+        assert_eq!(cleanup_comments(&mut ex2.into_iter()), "aa\n\n   a2\n\na1");
    }

    #[test]
    fn test_dedent() {
        let ex1 = "a\n   b\n   c\n     d";
-        assert_eq!(dedent_comment(ex1), "a\nb\nc\n  d");
+        assert_eq!(dedent_comment(ex1), ex1);
        let ex2 = "a\nb\nc";
        assert_eq!(dedent_comment(ex2), ex2);
        let ex3 = "   a\n   b\n\n     c";
@ -335,4 +489,31 @@ mod tests {
        let ex1 = "   **Foo**:";
        assert_eq!(cleanup_single_line(ex1), ex1);
    }
+
+    // TODO: Next CL
+    //#[test]
+    //fn comment_test_complex() {
+    //    let testcase = r#"
+    //    rec {
+    //        /*
+    //           Hello
+    //           23
+    //             This is a comment.
+    //             this is another comment.
+    //             and this is a third comment.
+    //                          Way
+    //              go
+    //        */
+    //        meow = { g }: {a, b ? 4, ...}: g: c: 5;
+    //        # And another comment.
+    //        cat = 34;
+    //        # inner layer.
+    //        "inner-layer" = outer: meow;
+    //    }
+    //    "#;
+    //    // Need to find the location of the lambda, we do a quick hack.
+    //    let location = dbg!(testcase.find("{ g }").unwrap() as u32);
+    //
+    //    //get_function_docs(filename, line, col)
+    //}
 }
--- a/lix-doc/src/pprint.rs
+++ b/lix-doc/src/pprint.rs
@ -1,38 +1,61 @@
 // SPDX-FileCopyrightText: 2024 Jade Lovelace
-//
+// SPDX-FileCopyrightText: 2024 Lunaphied
 // SPDX-License-Identifier: BSD-2-Clause OR MIT

-use rnix::types::{Lambda, TypedNode};
-use rnix::SyntaxKind::*;
+use rnix::ast::{Expr, Lambda};
+use rowan::ast::AstNode;

 /// Pretty-prints the arguments to a function
 pub fn pprint_args(lambda: &Lambda) -> String {
    // TODO: handle docs directly on NODE_IDENT args (uncommon case)
    let mut lambda = lambda.clone();
+    let mut depth = 0;
    let mut out = String::new();
    loop {
-        let arg = lambda.arg().unwrap();
-        match arg.kind() {
-            NODE_IDENT => {
-                out += &format!("*{}*", &arg.to_string());
+        let arg = lambda.param().unwrap();
+        for child in arg.syntax().children_with_tokens() {
+            //dbg!(child.kind());
+            match child {
+                rowan::NodeOrToken::Node(node) => {
+                    out.push_str(&node.text().to_string());
+                    if node.kind() == rnix::SyntaxKind::NODE_PAT_ENTRY {
+                        out.push_str(&",\n");
+                    }
+                }
+                rowan::NodeOrToken::Token(token) => {
+                    use rnix::SyntaxKind::{
+                        TOKEN_COMMENT, TOKEN_ELLIPSIS, TOKEN_L_BRACE, TOKEN_QUESTION, TOKEN_R_BRACE,
+                    };
+                    match token.kind() {
+                        TOKEN_COMMENT | TOKEN_ELLIPSIS | TOKEN_QUESTION | TOKEN_L_BRACE
+                        | TOKEN_R_BRACE => {
+                            //dbg!(&token);
+                            out.push_str(&token.text().to_string());
+                            if token.kind() == TOKEN_COMMENT {
+                                out.push('\n');
+                            }
+                        }
+                        _ => {}
+                    }
+                    //out.push_str(&token.text().to_string());
+                }
+            }
+        }
        out.push_str(": ");
        let body = lambda.body().unwrap();
-                if body.kind() == NODE_LAMBDA {
-                    lambda = Lambda::cast(body).unwrap();
+        if let Expr::Lambda(inner) = body {
+            lambda = inner;
+            // If we recurse we want the next line of recursion to be indented and on a new line.
+            out.push('\n');
+            for _ in 0..=depth {
+                out.push('\t');
+            }
+            depth += 1;
        } else {
+            // If we don't find an inner lambda we're done with argument handling.
            break;
        }
    }
-            NODE_PATTERN => {
-                out += &format!("*{}*", &arg.to_string());
-                out.push_str(": ");
-                break;
-            }
-            t => {
-                unreachable!("unhandled arg type {:?}", t);
-            }
-        }
-    }
    out.push_str("...");
    out

--- a/package.nix
+++ b/package.nix
@ -474,6 +474,14 @@ stdenv.mkDerivation (finalAttrs: {
              # Load-bearing order. Must come before clang-unwrapped below, but after clang_tools above.
              stdenv.cc
            ]
+            ++ [
+              pkgs.rust-analyzer
+              pkgs.cargo
+              pkgs.rustc
+              pkgs.rustfmt
+              pkgs.rustPlatform.rustLibSrc
+              pkgs.rustPlatform.rustcSrc
+            ]
            ++ lib.optionals stdenv.cc.isClang [
              # Required for clang-tidy checks.
              llvmPackages.llvm
--- a/src/libcmd/repl.cc
+++ b/src/libcmd/repl.cc
@ -40,24 +40,24 @@
 #include <gc/gc_cpp.h>
 #endif

-// XXX: These are for nix-doc features and will be removed in a future rewrite where this functionality is integrated more natively.
+// XXX: These are for lix-doc features and will be removed in a future rewrite where this functionality is integrated more natively.
 extern "C" {
-    char const *nd_get_function_docs(char const *filename, size_t line, size_t col);
-    void nd_free_string(char const *str);
+    char const *lixdoc_get_function_docs(char const *filename, size_t line, size_t col);
+    void lixdoc_free_string(char const *str);
 }

 namespace nix {


 /** Wrapper around std::unique_ptr with a custom deleter for strings from nix-doc **/
-using NdString = std::unique_ptr<const char, decltype(&nd_free_string)>;
+using NdString = std::unique_ptr<const char, decltype(&lixdoc_free_string)>;

 /**
 * Fetch a string representing the doc comment using nix-doc and wrap it in an RAII wrapper.
 */
 NdString lambdaDocsForPos(SourcePath const path, nix::Pos const &pos) {
  std::string const file = path.to_string();
-  return NdString{nd_get_function_docs(file.c_str(), pos.line, pos.column), &nd_free_string};
+  return NdString{lixdoc_get_function_docs(file.c_str(), pos.line, pos.column), &lixdoc_free_string};
 }

 /**