Obj-C: Strip HTML tags from an NSString

As it turns out, there is no single one-liner for stripping HTML tags from an NSString in the iOS SDK, so I’ve written the following NSString category that adds a stripHtml method to all NSStrings. To use it, add the .h and .m files to your project, then:

Import the header:

#import "NSString_stripHtml.h"

And then call stripHtml:

NSString* mystring = @"<b>Hello</b> World!!";
NSString* stripped = [mystring stripHtml];
// stripped is now @"Hello World!!" (the <b> tags are gone, the text is kept)

Get the code:

//  NSString_stripHtml.h
//  Copyright 2011 Leigh McCulloch. Released under the MIT license.

#import <Foundation/Foundation.h>

// Category that adds markup stripping to every NSString.
@interface NSString (stripHtml)
// Returns the receiver's text with markup tags removed.
// The receiver is wrapped in a synthetic <root> element and run through
// NSXMLParser; the result is the concatenation of all character data the
// parser reports. Ampersands are escaped before parsing, so entity references
// written in the receiver (e.g. "&amp;") appear unchanged in the result.
- (NSString*)stripHtml;
@end

//  NSString_stripHtml.m
//  Copyright 2011 Leigh McCulloch. Released under the MIT license.

#import "NSString_stripHtml.h"

// Private NSXMLParser delegate used by -[NSString stripHtml].
// It accumulates every run of character data the parser reports (i.e. the
// text found outside of tags) so the caller can read it back as one string.
@interface NSString_stripHtml_XMLParsee : NSObject<NSXMLParserDelegate> {
@private
    // Text collected so far, in document order.
    NSMutableString* foundChars;
}
// Returns an immutable snapshot of all character data received so far.
// Returns an empty string if nothing has been reported yet.
- (NSString*)getCharsFound;
@end

@implementation NSString_stripHtml_XMLParsee
- (id)init {
    if((self = [super init])) {
        foundChars = [[NSMutableString alloc] init];
    }
    return self;
}
// Manual reference counting only: under ARC the ivar is released
// automatically and calling -release / [super dealloc] is a compile error.
#if !__has_feature(objc_arc)
- (void)dealloc {
    [foundChars release];
    [super dealloc];
}
#endif
// NSXMLParserDelegate callback. The parser may deliver the text of a single
// element in several pieces, so each piece is appended to the buffer. The
// characters are copied immediately rather than keeping a reference to the
// object handed over by the parser.
- (void)parser:(NSXMLParser*)parser foundCharacters:(NSString*)string {
    if(string != nil) {
        [foundChars appendString:string];
    }
}
- (NSString*)getCharsFound {
    // Hand back a copy so the result stays valid (and unchanged) after this
    // delegate is released or receives more characters.
    return [NSString stringWithString:foundChars];
}
@end

@implementation NSString (stripHtml)
// Returns the receiver's text with markup tags removed.
//
// The receiver is treated as an XML fragment: it is wrapped in a <root>
// element, parsed with NSXMLParser, and every run of character data found
// outside of tags is concatenated into the result.
//
// Parsing is best-effort. Parse errors are ignored, and the result is whatever
// text the delegate collected. Input that is not well-formed XML (for
// example an unclosed <br>) may therefore yield only the text that precedes
// the first error.
- (NSString*)stripHtml {
    // Escape ampersands first so that entity references in the receiver
    // (e.g. "&lt;") are not decoded by the parser and survive as written.
    NSString* escaped = [self stringByReplacingOccurrencesOfString:@"&" withString:@"&amp;"];

    // Wrap in a root element so the document has exactly one root even when
    // the receiver contains several sibling elements or bare text.
    NSString* xml = [NSString stringWithFormat:@"<root>%@</root>", escaped];

    // Always hand the parser UTF-8. The document carries no XML declaration,
    // so the parser has to guess the encoding; -fastestEncoding may name an
    // 8-bit encoding it cannot detect, which would corrupt non-ASCII text.
    // Lossy conversion is meant to keep this from returning nil for strings
    // that cannot be represented exactly (e.g. unpaired surrogates).
    NSData* data = [xml dataUsingEncoding:NSUTF8StringEncoding allowLossyConversion:YES];
    NSXMLParser* parser = [[NSXMLParser alloc] initWithData:data];

    // The delegate records any characters found outside tags; that text is
    // the stripped content.
    NSString_stripHtml_XMLParsee* parsee = [[NSString_stripHtml_XMLParsee alloc] init];
    parser.delegate = parsee;

    // The return value is deliberately ignored (see the best-effort note
    // above). Inspect [parser parserError] here when debugging bad input.
    [parser parse];

    NSString* strippedString = [parsee getCharsFound];

    // Clean up. Only needed with manual reference counting; under ARC these
    // calls are compile errors and the objects are released automatically.
#if !__has_feature(objc_arc)
    [parser release];
    [parsee release];
#endif

    return strippedString;
}
@end

Get the code above, or from my GitHub Gist.

Comments

comments powered by Disqus