htmlArea

A directory of browser-based WYSIWYG editors

  MAIN
INDEX
SEARCH
POSTS
WHO'S
ONLINE
LOG
IN

Home: htmlArea 3 (beta): htmlArea 2 & 3 archive (read only): htmlArea v3.0 - Discussion:
Word cleaner that only cleans pasted in text


The htmlArea 2 & 3 editors have been discontinued.

We've made these forums available as a read-only reference and knowledge-base for people using or developing editors based on htmlArea 2 or 3.

Anyone who is interested in taking over version 2 or 3 is free to do so. All we ask is that you choose a new name that doesn't have "htmlarea" in it, to avoid confusion with this site. We'll even give you a link in the directory to make it easier for people to find you. If you are developing or hosting an htmlArea-based editor under a new name, please submit it to our directory.

 


monkey3203
New User

Oct 19, 2004, 7:58 PM

Post #1 of 3 (4818 views)
Shortcut
Word cleaner that only cleans pasted in text Can't Post

After quite a bit of fiddling, I've figured out a way to get the word cleaner to only strip things from newly pasted in text, so that text already in the window doesn't lose formatting.

I have commented out a couple lines of code here, but basically the change I've made is to use regular expressions to grab the text that looks like it comes from word. I have the code work on that then search/replace for the old text with the stripped down text.

/**
 * Strip Microsoft Word markup from pasted paragraphs while leaving every
 * other part of the document exactly as it was.
 *
 * A "Word paragraph" is any <P> element whose opening tag starts with
 * class=Mso... or class="Mso... (matched case-insensitively, and allowed to
 * span several lines). Each such paragraph is cleaned on its own and written
 * back at the position it was found.
 *
 * Reads the document with this.getInnerHTML(). When at least one Word
 * paragraph was found, the result is stored with this.setHTML() and
 * this.updateToolbar() is called; otherwise the editor is not touched.
 * Takes no arguments and returns nothing.
 */
HTMLArea.prototype._wordClean = function() {
  var mainD = this.getInnerHTML();

  // [\s\S] instead of "." so that a paragraph containing line breaks is
  // still recognised as one unit.
  var wordParagraph = /<p\s+class="?Mso[^>]*>[\s\S]*?<\/p>/gi;

  // Clean a single Word paragraph (opening tag through closing tag).
  function cleanWordParagraph(D) {
    var oldlen;

    // keep tags, strip attributes
    D = D.replace(/ class=[^\s|>]*/gi, '')
         .replace(/ style="[^>]*"/gi, '')
         .replace(/ align=[^\s|>]*/gi, '');

    // clean up tags
    D = D.replace(/<b [^>]*>/gi, '<b>')
         .replace(/<i [^>]*>/gi, '<i>')
         .replace(/<li [^>]*>/gi, '<li>')
         .replace(/<ul [^>]*>/gi, '<ul>');

    // replace outdated tags
    D = D.replace(/<b>/gi, '<strong>')
         .replace(/<\/b>/gi, '</strong>');

    // mozilla doesn't like <em> tags
    D = D.replace(/<em>/gi, '<i>')
         .replace(/<\/em>/gi, '</i>');

    // kill unwanted tags
    D = D.replace(/<\?xml:[^>]*>/g, '')        // Word xml
         .replace(/<\/?st1:[^>]*>/g, '')       // Word SmartTags
         .replace(/<\/?[a-z]\:[^>]*>/g, '')    // All other funny Word non-HTML stuff
         .replace(/<\/?font[^>]*>/gi, '')      // Disable if you want to keep font formatting
         .replace(/<\/?span[^>]*>/gi, ' ')
         .replace(/<\/?div[^>]*>/gi, ' ')
         .replace(/<\/?pre[^>]*>/gi, ' ')
         .replace(/<\/?h[1-6][^>]*>/gi, ' ');

    // nuke double tags: repeat until a pass no longer shortens the text,
    // because removing one empty element can expose another.
    do {
      oldlen = D.length;
      D = D.replace(/<([a-z][a-z]*)> *<\/\1>/gi, ' ')
           .replace(/<([a-z][a-z]*)> *<([a-z][^>]*)> *<\/\1>/gi, '<$2>');
    } while (D.length < oldlen);
    D = D.replace(/<([a-z][a-z]*)><\1>/gi, '<$1>')
         .replace(/<\/([a-z][a-z]*)><\/\1>/gi, '<\/$1>');

    // nuke double spaces. The quantifier must require at least two spaces:
    // a pattern that can match the empty string would insert a space
    // between every pair of characters.
    return D.replace(/ {2,}/g, ' ');
  }

  // A function replacement cleans every paragraph individually and inserts
  // the result literally, so "$1" or "$&" in the pasted text is not
  // interpreted as a replacement pattern.
  var cleaned = mainD.replace(wordParagraph, cleanWordParagraph);

  // Cleaning always removes the class attribute, so identical text means no
  // Word paragraph was present and there is nothing to write back.
  if (cleaned === mainD) {
    return;
  }
  this.setHTML(cleaned);
  this.updateToolbar();
};


mikeyb101
Novice

Dec 8, 2004, 1:22 PM

Post #2 of 3 (4263 views)
Shortcut
Re: [monkey3203] Word cleaner that only cleans pasted in text [In reply to] Can't Post

Hi

I had a few problems with this, so I've tweaked it and it works fine now. It may have been buggy because my htmlarea is in a table cell.


Code
   



/**
 * Strip Microsoft Word markup from pasted paragraphs only.
 *
 * Every <P> element whose opening tag starts with class=Mso... or
 * class="Mso... (case-insensitive, may span lines) is cleaned and written
 * back where it was found; all other markup in the document is preserved.
 *
 * This variant keeps <b> tags as they are, removes empty <strong>, <i> and
 * <P> elements, and does not collapse runs of spaces.
 *
 * Reads the document with this.getInnerHTML(). When at least one Word
 * paragraph was found, the result is stored with this.setHTML() and
 * this.updateToolbar() is called; otherwise the editor is not touched.
 * Takes no arguments and returns nothing.
 */
HTMLArea.prototype._wordClean = function() {
  var mainD = this.getInnerHTML();

  // Every regular expression below is written on a single line: a regex or
  // string literal that is wrapped across lines does not parse.
  var wordParagraph = /<p\s+class="?Mso[^>]*>[\s\S]*?<\/p>/gi;

  // Clean a single Word paragraph. D is the matched text itself (a string),
  // not the array returned by RegExp.exec.
  function cleanWordParagraph(D) {
    var oldlen;

    // keep tags, strip attributes
    D = D.replace(/ class=[^\s|>]*/gi, '')
         .replace(/ style="[^>]*"/gi, '')
         .replace(/ align=[^\s|>]*/gi, '');

    // clean up tags
    D = D.replace(/<b [^>]*>/gi, '<b>')
         .replace(/<i [^>]*>/gi, '<i>')
         .replace(/<li [^>]*>/gi, '<li>')
         .replace(/<ul [^>]*>/gi, '<ul>');

    // mozilla doesn't like <em> tags
    D = D.replace(/<em>/gi, '<i>')
         .replace(/<\/em>/gi, '</i>');

    // kill unwanted tags
    D = D.replace(/<\?xml:[^>]*>/g, '')        // Word xml
         .replace(/<\/?st1:[^>]*>/g, '')       // Word SmartTags
         .replace(/<\/?[a-z]\:[^>]*>/g, '')    // All other funny Word non-HTML stuff
         .replace(/<\/?font[^>]*>/gi, '')      // Disable if you want to keep font formatting
         .replace(/<\/?span[^>]*>/gi, ' ')
         .replace(/<\/?div[^>]*>/gi, ' ')
         .replace(/<\/?pre[^>]*>/gi, ' ')
         .replace(/<\/?h[1-6][^>]*>/gi, ' ');

    // remove empty tags
    D = D.replace(/<strong><\/strong>/gi, '')
         .replace(/<i><\/i>/gi, '')
         .replace(/<P[^>]*><\/P>/gi, '');

    // nuke double tags: repeat until a pass no longer shortens the text,
    // because removing one empty element can expose another.
    do {
      oldlen = D.length;
      D = D.replace(/<([a-z][a-z]*)> *<\/\1>/gi, ' ')
           .replace(/<([a-z][a-z]*)> *<([a-z][^>]*)> *<\/\1>/gi, '<$2>');
    } while (D.length < oldlen);

    return D.replace(/<([a-z][a-z]*)><\1>/gi, '<$1>')
            .replace(/<\/([a-z][a-z]*)><\/\1>/gi, '<\/$1>');
  }

  // Replace inside the full document (not inside the cleaned fragment) so
  // the content surrounding the pasted text is kept. The function form
  // inserts the cleaned text literally, without "$" pattern expansion.
  var cleaned = mainD.replace(wordParagraph, cleanWordParagraph);

  // Cleaning always removes the class attribute, so identical text means no
  // Word paragraph was present and there is nothing to write back.
  if (cleaned === mainD) {
    return;
  }
  this.setHTML(cleaned);
  this.updateToolbar();
};



mikeyb101
Novice

Jan 17, 2005, 3:42 PM

Post #3 of 3 (3767 views)
Shortcut
Re: [mikeyb101] Word cleaner that only cleans pasted in text [In reply to] Can't Post

Hi

I've been playing around with the word cleaner again and come up with this that cleans only pasted word text:

/**
 * Strip Microsoft Word markup from the pasted region of the document.
 *
 * The region starts at the first <P class=Mso...> / <P class="Mso..."> tag
 * and runs up to (but not including) the first </P> that follows the last
 * "mso"/"Mso" marker in the document. Only that region is cleaned; the text
 * before it and the text from that closing </P> onwards are passed through
 * unchanged.
 *
 * Reads the document with this.getInnerHTML(). When a Word paragraph was
 * found, the result is stored with this.setHTML() and this.updateToolbar()
 * is called; otherwise the editor is not touched.
 * Takes no arguments and returns nothing.
 */
HTMLArea.prototype._wordClean = function() {
  var D = this.getInnerHTML();

  // Start of the Word region; accepts both the quoted and unquoted class.
  var start = D.search(/<p\s+class="?Mso/i);
  if (start < 0) {
    // No Word paragraph at all: slicing with -1 would garble the document.
    return;
  }

  // Last Word marker in either capitalisation. "start" is included so the
  // value can never point before the region even if neither spelling occurs.
  var lastMarker = Math.max(D.lastIndexOf("mso"), D.lastIndexOf("Mso"), start);

  // End of the region: the closing </P> of the paragraph holding the last
  // marker, or the end of the document when that paragraph is unclosed.
  var closeOffset = D.substring(lastMarker).search(/<\/p>/i);
  var end = closeOffset < 0 ? D.length : lastMarker + closeOffset;

  // D1 must run right up to "start"; stopping one character earlier loses
  // the character immediately in front of the pasted text.
  var D1 = D.substring(0, start);
  var D2 = D.substring(start, end);
  var D3 = D.substring(end);
  var oldlen;

  // make one line
  D2 = D2.replace(/\r\n|\n|\r/g, ' ')
         .replace(/&nbsp;/g, ' ');

  // keep tags, strip attributes
  D2 = D2.replace(/ class=[^\s|>]*/gi, '')
         .replace(/ style="[^>]*"/gi, '')
         .replace(/ align=[^\s|>]*/gi, '');

  // clean up tags
  D2 = D2.replace(/<b [^>]*>/gi, '<b>')
         .replace(/<i [^>]*>/gi, '<i>')
         .replace(/<li [^>]*>/gi, '<li>')
         .replace(/<ul [^>]*>/gi, '<ul>');

  // replace outdated tags
  D2 = D2.replace(/<b>/gi, '<strong>')
         .replace(/<\/b>/gi, '</strong>');

  // mozilla doesn't like <em> tags
  D2 = D2.replace(/<em>/gi, '<i>')
         .replace(/<\/em>/gi, '</i>');

  // kill unwanted tags
  D2 = D2.replace(/<\?xml:[^>]*>/g, '')        // Word xml
         .replace(/<\/?st1:[^>]*>/g, '')       // Word SmartTags
         .replace(/<\/?[a-z]\:[^>]*>/g, '')    // All other funny Word non-HTML stuff
         .replace(/<\/?font[^>]*>/gi, '')      // Disable if you want to keep font formatting
         .replace(/<\/?span[^>]*>/gi, ' ')
         .replace(/<\/?div[^>]*>/gi, ' ')
         .replace(/<\/?pre[^>]*>/gi, ' ')
         .replace(/<\/?h[1-6][^>]*>/gi, ' ');

  // remove empty tags
  D2 = D2.replace(/<strong><\/strong>/gi, '')
         .replace(/<i><\/i>/gi, '')
         .replace(/<P[^>]*><\/P>/gi, '');

  // nuke double tags: repeat until a pass no longer shortens the text,
  // because removing one empty element can expose another.
  do {
    oldlen = D2.length;
    D2 = D2.replace(/<([a-z][a-z]*)> *<\/\1>/gi, ' ')
           .replace(/<([a-z][a-z]*)> *<([a-z][^>]*)> *<\/\1>/gi, '<$2>');
  } while (D2.length < oldlen);
  D2 = D2.replace(/<([a-z][a-z]*)><\1>/gi, '<$1>')
         .replace(/<\/([a-z][a-z]*)><\/\1>/gi, '<\/$1>');

  // nuke double spaces. The quantifier must require at least two spaces:
  // a pattern that can match the empty string would insert a space between
  // every pair of characters.
  D2 = D2.replace(/ {2,}/g, ' ');

  this.setHTML(D1 + D2 + D3);
  this.updateToolbar();
};


Maybe not the most efficient but it works for me, should work in mozilla too...
Comments most welcome!



(This post was edited by mikeyb101 on Jan 17, 2005, 3:43 PM)

 
 
 


Search for (options)